Commit 1b5be09d authored by Nathalie Furmento's avatar Nathalie Furmento
Browse files

website: tutorial prace

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/starpu/website@12898 176f6dd6-97d6-42f4-bd05-d3db9ad07c7a
parent 8d63c1bb
CFLAGS += $(shell pkg-config --cflags starpu-1.1)
LDFLAGS += $(shell pkg-config --libs starpu-1.1)
vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
%.o: %.cu
nvcc $(CFLAGS) $< -c -o $@
clean:
rm -f vector_scal *.o
#how many nodes and cores
#PBS -W x=NACCESSPOLICY:SINGLEJOB -q mirage -l nodes=1:ppn=12
make
vector_scal_task_insert
# to force the implementation on a GPU device, by default, it will enable CUDA
# STARPU_NCPUS=0 vector_scal_task_insert
# to force the implementation on a OpenCL device
# STARPU_NCPUS=0 STARPU_NCUDA=0 vector_scal_task_insert
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010, 2011 Université de Bordeaux 1
* Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <starpu.h>
/* This kernel takes a buffer and scales it by a constant factor */
/* CPU implementation of the scaling codelet: multiply every element of
 * the vector passed through buffers[0] by the factor passed in cl_arg. */
void vector_scal_cpu(void *buffers[], void *cl_arg)
{
	/* The scaling factor arrives as a non-DSM argument. */
	float *factor = cl_arg;

	/*
	 * The "buffers" array matches the task->handles array:
	 * task->handles[0] was registered with the vector interface, so
	 * buffers[0] points to a struct starpu_vector_interface. It gives
	 * nx (number of elements), ptr (the location of the possibly
	 * migrated/replicated array) and elemsize (size of each element).
	 */
	struct starpu_vector_interface *vector = buffers[0];

	/* number of elements in the vector */
	unsigned nx = STARPU_VECTOR_GET_NX(vector);

	/* Pointer to the local copy of the data. The cast from uintptr_t
	 * is needed because a vector may hold elements of any type. */
	float *data = (float *)STARPU_VECTOR_GET_PTR(vector);

	/* apply the factor to each element in turn */
	unsigned idx;
	for (idx = 0; idx < nx; idx++)
		data[idx] *= *factor;
}
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010, 2011 Université de Bordeaux 1
* Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <starpu.h>
/* CUDA device kernel: each thread scales at most one element of val. */
static __global__ void vector_mult_cuda(float *val, unsigned int n, float factor)
{
	unsigned tid = blockIdx.x * blockDim.x + threadIdx.x;

	/* threads beyond the end of the array do nothing */
	if (tid < n)
		val[tid] *= factor;
}
extern "C" void vector_scal_cuda(void *buffers[], void *_args)
{
float *factor = (float *)_args;
/* length of the vector */
unsigned int n = STARPU_VECTOR_GET_NX(buffers[0]);
/* local copy of the vector pointer */
float *val = (float *)STARPU_VECTOR_GET_PTR(buffers[0]);
unsigned threads_per_block = 64;
unsigned nblocks = (n + threads_per_block-1) / threads_per_block;
vector_mult_cuda<<<nblocks,threads_per_block, 0, starpu_cuda_get_local_stream()>>>(val, n, *factor);
cudaStreamSynchronize(starpu_cuda_get_local_stream());
}
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010, 2011 Université de Bordeaux 1
* Copyright (C) 2010, 2011, 2012, 2013 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <starpu.h>
extern struct starpu_opencl_program programs;
/* OpenCL host-side implementation of the scaling codelet: loads the
 * pre-compiled "vector_mult_opencl" kernel and runs it over the whole
 * vector on the device driven by the current worker. */
void vector_scal_opencl(void *buffers[], void *_args)
{
	float *factor = _args;
	int id, devid, err;
	cl_kernel kernel;
	cl_command_queue queue;
	cl_event event;

	/* length of the vector */
	unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
	/* OpenCL device handle of the vector */
	cl_mem val = (cl_mem) STARPU_VECTOR_GET_DEV_HANDLE(buffers[0]);

	/* find which OpenCL device this worker drives */
	id = starpu_worker_get_id();
	devid = starpu_worker_get_devid(id);

	err = starpu_opencl_load_kernel(&kernel, &queue, &programs,
			"vector_mult_opencl", devid); /* Name of the kernel in the .cl file */
	if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);

	/* Arguments must be set in the order of the kernel signature:
	 * (unsigned int nx, __global float *val, float factor). */
	err = clSetKernelArg(kernel, 0, sizeof(n), &n);
	err |= clSetKernelArg(kernel, 1, sizeof(val), &val);
	err |= clSetKernelArg(kernel, 2, sizeof(*factor), factor);
	if (err) STARPU_OPENCL_REPORT_ERROR(err);

	{
		/* One work-item per element: a global size of 1 would scale
		 * only the first element. Pass NULL as the local size so the
		 * OpenCL implementation picks a suitable work-group size. */
		size_t global = n;
		err = clEnqueueNDRangeKernel(queue, kernel, 1, NULL, &global, NULL, 0, NULL, &event);
		if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
	}

	/* wait for the kernel to complete before the codelet returns */
	clFinish(queue);
	starpu_opencl_collect_stats(event);
	clReleaseEvent(event);

	starpu_opencl_release_kernel(kernel);
}
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010, 2011, 2013 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
/* OpenCL device kernel: each work-item scales one element of val. */
__kernel void vector_mult_opencl(unsigned int nx, __global float* val, float factor)
{
	const int i = get_global_id(0);

	/* work-items past the end of the array do nothing */
	if (i >= nx)
		return;

	val[i] *= factor;
}
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010, 2011, 2012, 2013, 2014 Centre National de la Recherche Scientifique
* Copyright (C) 2010-2012 Université de Bordeaux 1
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
/*
* This example demonstrates how to use StarPU to scale an array by a factor.
* It shows how to manipulate data with StarPU's data management library.
* 1- how to declare a piece of data to StarPU (starpu_vector_data_register)
* 2- how to submit a task to StarPU
* 3- how a kernel can manipulate the data (buffers[0].vector.ptr)
*/
#include <starpu.h>
#define NX 2048
extern void vector_scal_cpu(void *buffers[], void *_args);
extern void vector_scal_cuda(void *buffers[], void *_args);
extern void vector_scal_opencl(void *buffers[], void *_args);
/* The codelet gathers the per-architecture implementations of the task
 * together with a description of the data it accesses. The CUDA and
 * OpenCL entries are only compiled in when StarPU was built with the
 * corresponding support; the NULL entries terminate each function array. */
static struct starpu_codelet cl = {
/* CPU implementation of the codelet */
.cpu_funcs = {vector_scal_cpu, NULL},
#ifdef STARPU_USE_CUDA
/* CUDA implementation of the codelet */
.cuda_funcs = {vector_scal_cuda, NULL},
#endif
#ifdef STARPU_USE_OPENCL
/* OpenCL implementation of the codelet */
.opencl_funcs = {vector_scal_opencl, NULL},
#endif
/* the codelet accesses a single data handle ... */
.nbuffers = 1,
/* ... in read-write mode */
.modes = {STARPU_RW}
};
#ifdef STARPU_USE_OPENCL
struct starpu_opencl_program programs;
#endif
/* Scale a vector of NX floats by a constant factor through a StarPU task. */
int main(int argc, char **argv)
{
	/* We consider a vector of float that is initialized just as any C data */
	float vector[NX];
	unsigned i;
	for (i = 0; i < NX; i++)
		vector[i] = 1.0f;
	fprintf(stderr, "BEFORE : First element was %f\n", vector[0]);

	/* Initialize StarPU with default configuration */
	int ret = starpu_init(NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

#ifdef STARPU_USE_OPENCL
	/* Compile the OpenCL kernel source; check the result instead of
	 * silently continuing with an unusable program. */
	ret = starpu_opencl_load_opencl_from_file("vector_scal_opencl_kernel.cl", &programs, NULL);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_opencl_load_opencl_from_file");
#endif

	/* Tell StarPU to associate the "vector" vector with the "vector_handle"
	 * identifier. When a task needs to access a piece of data, it should
	 * refer to the handle that is associated to it.
	 * In the case of the "vector" data interface:
	 * - the first argument of the registration method is a pointer to the
	 *   handle that should describe the data
	 * - the second argument is the memory node where the data (ie. "vector")
	 *   resides initially: STARPU_MAIN_RAM stands for an address in main
	 *   memory, as opposed to an address on a GPU for instance.
	 * - the third argument is the address of the vector in RAM
	 * - the fourth argument is the number of elements in the vector
	 * - the fifth argument is the size of each element.
	 */
	starpu_data_handle_t vector_handle;
	starpu_vector_data_register(&vector_handle, STARPU_MAIN_RAM, (uintptr_t)vector,
				    NX, sizeof(vector[0]));

	/* float literal: avoids a needless double-to-float narrowing */
	float factor = 3.14f;

	ret = starpu_task_insert(&cl,
				 /* an argument is passed to the codelet, beware that this is a
				  * READ-ONLY buffer and that the codelet may be given a pointer to a
				  * COPY of the argument */
				 STARPU_VALUE, &factor, sizeof(factor),
				 /* the codelet manipulates one buffer in RW mode */
				 STARPU_RW, vector_handle,
				 0);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_insert");

	/* Wait for tasks completion, and check that nothing went wrong */
	ret = starpu_task_wait_for_all();
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_wait_for_all");

	/* StarPU does not need to manipulate the array anymore so we can stop
	 * monitoring it; this also brings the up-to-date copy back to RAM */
	starpu_data_unregister(vector_handle);

#ifdef STARPU_USE_OPENCL
	starpu_opencl_unload_opencl(&programs);
#endif

	/* terminate StarPU, no task can be submitted after */
	starpu_shutdown();

	fprintf(stderr, "AFTER First element is %f\n", vector[0]);
	return 0;
}
......@@ -48,7 +48,7 @@ Jobs can be submitted to the platform to reserve a set of nodes and to
execute an application on these nodes. We advise not to reserve nodes
interactively so as not to block the machines for the other
participants. Here is a script (available
for <a href="files/starpu_machine_display.pbs">download</a>) to submit your
for <a href="files/starpu_machine_display.pbs">download</a>) to submit your
first StarPU application. It calls the
tool <tt>starpu_machine_display</tt> which shows the processing units
that StarPU can use, and the bandwidth and affinity measured between
......@@ -102,69 +102,108 @@ hostnames for the different node classes, as appropriate.
</div>
<!--
<div class="section">
<h2>Hands-on session part 1: Task-based programming model</h2>
<h2>Session part 1: Task-based programming model</h2>
<div class="section">
<h3>Application example: vector scaling</h3>
<h4>Making it and running it</h4>
<p>A typical <tt>Makefile</tt> for applications using StarPU is then the
following (<a href=Makefile>available for download</a>):</p>
<p>
A typical <tt>Makefile</tt> for applications using StarPU is then the
following (<a href="files/Makefile">available for download</a>):
</p>
<tt><pre>
CFLAGS += $(shell pkg-config --cflags starpu-1.1)
LDFLAGS += $(shell pkg-config --libs starpu-1.1)
vector_scal: vector_scal.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
vector_scal_task_insert: vector_scal_task_insert.o vector_scal_cpu.o vector_scal_cuda.o vector_scal_opencl.o
%.o: %.cu
nvcc $(CFLAGS) $< -c -o $@
clean:
rm -f vector_scal *.o
rm -f vector_scal_task_insert *.o
</pre></tt>
<p>Copy the <tt>vector_scal*.c*</tt> files and the
<tt>vector_scal_cpu_template.h</tt> file from <tt>examples/basic_examples</tt>
into a new empty directory, along with the <tt>Makefile</tt> mentioned above.
Run <tt>make</tt>, and run the resulting <tt>vector_scal</tt> executable using
the batch scheduler. It should be working: it simply scales a given vector by a
given factor.</p>
<p>
Download the following files along with the <tt>Makefile</tt>
mentioned above.
<ul>
<li><a href="files/vector_scal_task_insert.c">The main application</a></li>
<li><a href="files/vector_scal_cpu.c">The CPU implementation of the codelet</a></li>
<li><a href="files/vector_scal_cuda.cu">The CUDA implementation of the codelet</a></li>
<li><a href="files/vector_scal_opencl.c">The OpenCL host implementation of the codelet</a></li>
<li><a href="files/vector_scal_opencl_kernel.cl">The OpenCL device implementation of the codelet</a></li>
</ul>
Run <tt>make</tt>, and run the
resulting <tt>vector_scal_task_insert</tt> executable using the batch
scheduler using the <a href="files/vector_scal.pbs">given qsub script</a>. It should be working: it simply scales a given vector by a
given factor.
</p>
<h4>Computation kernels</h4>
<p>Examine the source code, starting from <tt>vector_scal_cpu.c</tt> : this is
<p>
Examine the source code, starting from <tt>vector_scal_cpu.c</tt> : this is
the actual computation code, which is wrapped into a <tt>scal_cpu_func</tt>
function which takes a series of DSM interfaces and a non-DSM parameter. The
code simply gets an actual pointer from the first DSM interface, and the factor
value from the non-DSM parameter, and performs the vector scaling.</p>
value from the non-DSM parameter, and performs the vector scaling.
</p>
<p>The GPU implementation, in <tt>vector_scal_cuda.cu</tt>, is basically
<p>
The GPU implementation, in <tt>vector_scal_cuda.cu</tt>, is basically
the same, with the host part (<tt>scal_cuda_func</tt>) which extracts the
actual CUDA pointer from the DSM interface, and passes it to the device part
(<tt>vector_mult_cuda</tt>) which performs the actual computation.</p>
(<tt>vector_mult_cuda</tt>) which performs the actual computation.
</p>
<p>
The OpenCL implementation is more hairy due to the low-level aspect of the
OpenCL standard, but the principle remains the same.
</p>
<p>The OpenCL implementation is more hairy due to the low-level aspect of the
OpenCL standard, but the principle remains the same.</p>
<p>
Modify the source code of the different implementations (CPU, CUDA and
OpenCL) and see which one gets executed. You can force the execution
of one of the implementations simply by disabling a type of device when
running your application, e.g.:
</p>
<tt><pre>
# to force the implementation on a GPU device, by default, it will enable CUDA
STARPU_NCPUS=0 vector_scal_task_insert
# to force the implementation on a OpenCL device
STARPU_NCPUS=0 STARPU_NCUDA=0 vector_scal_task_insert
</pre></tt>
<h4>Main code</h4>
<p>Now examine <tt>vector_scal.c</tt>: the <tt>cl</tt> (codelet) structure simply gathers
pointers on the functions mentioned above. It also includes a performance model.</p>
<p>
Now examine <tt>vector_scal_task_insert.c</tt>: the <tt>cl</tt>
(codelet) structure simply gathers pointers on the functions
mentioned above.
</p>
<p>The <tt>main</tt> function
<p>
The <tt>main</tt> function
<ul>
<li>Allocates a <tt>vector</tt> application buffer and fills it.</li>
<li>Registers it to StarPU, and get back a DSM handle. For now on, the
<li>Registers it to StarPU, and gets back a DSM handle. From now on, the
application is not supposed to access <tt>vector</tt> directly, since its
content may be copied and modified by a task on a GPU, the main-memory copy then
being outdated.</li>
<li>Submits a (synchronous) task to StarPU.</li>
<li>Submits an (asynchronous) task to StarPU.</li>
<li>Waits for task completion.</li>
<li>Unregisters the vector from StarPU, which brings back the modified version
to main memory.</li>
</ul>
</p>
</div>
</div>
<!--
<div class="section">
<h3>Data partitioning</h3>
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment