Commit 43032fba authored by Nathalie Furmento's avatar Nathalie Furmento

website: mpi for tutorials/2014-05-PATC

git-svn-id: svn+ssh://scm.gforge.inria.fr/svn/starpu/website@12911 176f6dd6-97d6-42f4-bd05-d3db9ad07c7a
parent d9604e26
CFLAGS += $(shell pkg-config --cflags starpumpi-1.1)
LDFLAGS += $(shell pkg-config --libs starpumpi-1.1)
CC=mpicc
ring_async_implicit: ring_async_implicit.o
stencil5: stencil5.o
clean:
rm -f ring_async_implicit stencil5 *.o
#how many nodes and cores
#PBS -W x=NACCESSPOLICY:SINGLEJOB -q mirage -l nodes=1:ppn=12
make ring_async_implicit
mpirun -np 2 $PWD/ring_async_implicit
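# The stencil example can be built and run the same way (illustrative
# invocation: the process count is up to you; the -iter and -display
# options are parsed by stencil5.c)
make stencil5
mpirun -np 4 $PWD/stencil5 -iter 10 -display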
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010 Université de Bordeaux 1
* Copyright (C) 2010, 2011, 2012, 2013, 2014 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <starpu_mpi.h>
#include <stdio.h>
#define NITER 32
int token = 42;
starpu_data_handle_t token_handle;
void increment_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
int *tokenptr = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
(*tokenptr)++;
}
static struct starpu_codelet increment_cl =
{
.cpu_funcs = {increment_cpu, NULL},
.nbuffers = 1,
.modes = {STARPU_RW}
};
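/* Create and submit a task that increments the token; the submission is asynchronous */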
void increment_token(void)
{
struct starpu_task *task = starpu_task_create();
task->cl = &increment_cl;
task->handles[0] = token_handle;
int ret = starpu_task_submit(task);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}
int main(int argc, char **argv)
{
int ret, rank, size;
ret = starpu_init(NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
ret = starpu_mpi_init(NULL, NULL, 1);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
if (size < 2)
{
if (rank == 0)
fprintf(stderr, "We need at least 2 processes.\n");
MPI_Finalize();
return 77;
}
starpu_vector_data_register(&token_handle, 0, (uintptr_t)&token, 1, sizeof(token));
int nloops = NITER;
int loop;
int last_loop = nloops - 1;
int last_rank = size - 1;
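/* Circulate the token around the ring: each rank receives it from its predecessor
 * (except rank 0 on the first loop), increments it with a task, and forwards it to
 * its successor (except the last rank on the last loop). */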
for (loop = 0; loop < nloops; loop++)
{
int tag = loop*size + rank;
if (loop == 0 && rank == 0)
{
token = 0;
fprintf(stdout, "Start with token value %u\n", token);
}
else
{
starpu_mpi_irecv_detached(token_handle, (rank+size-1)%size, tag, MPI_COMM_WORLD, NULL, NULL);
}
increment_token();
if (loop == last_loop && rank == last_rank)
{
starpu_data_acquire(token_handle, STARPU_R);
fprintf(stdout, "Finished : token value %u\n", token);
starpu_data_release(token_handle);
}
else
{
starpu_mpi_isend_detached(token_handle, (rank+1)%size, tag+1, MPI_COMM_WORLD, NULL, NULL);
}
}
starpu_task_wait_for_all();
starpu_data_unregister(token_handle);
starpu_mpi_shutdown();
starpu_shutdown();
if (rank == last_rank)
{
fprintf(stderr, "[%d] token = %u == %u * %d ?\n", rank, token, nloops, size);
STARPU_ASSERT(token == nloops*size);
}
return 0;
}
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2011, 2012, 2013, 2014 Centre National de la Recherche Scientifique
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <starpu_mpi.h>
#include <math.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
void stencil5_cpu(void *descr[], STARPU_ATTRIBUTE_UNUSED void *_args)
{
unsigned *xy = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[0]);
unsigned *xm1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[1]);
unsigned *xp1y = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[2]);
unsigned *xym1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[3]);
unsigned *xyp1 = (unsigned *)STARPU_VARIABLE_GET_PTR(descr[4]);
//fprintf(stdout, "VALUES: %d %d %d %d %d\n", *xy, *xm1y, *xp1y, *xym1, *xyp1);
*xy = (*xy + *xm1y + *xp1y + *xym1 + *xyp1) / 5;
}
struct starpu_codelet stencil5_cl =
{
.cpu_funcs = {stencil5_cpu, NULL},
.nbuffers = 5,
.modes = {STARPU_RW, STARPU_R, STARPU_R, STARPU_R, STARPU_R}
};
#define NITER_DEF 10
#define X 7
#define Y 7
int display = 0;
int niter = NITER_DEF;
/* Returns the MPI node number where data indexes index is */
int my_distrib(int x, int y, int nb_nodes)
{
/* Block distrib */
return ((int)(x / sqrt(nb_nodes) + (y / sqrt(nb_nodes)) * sqrt(nb_nodes))) % nb_nodes;
}
/* Shifted distribution, for migration example */
int my_distrib2(int x, int y, int nb_nodes)
{
return (my_distrib(x, y, nb_nodes) + 1) % nb_nodes;
}
static void parse_args(int argc, char **argv)
{
int i;
for (i = 1; i < argc; i++)
{
if (strcmp(argv[i], "-iter") == 0)
{
char *argptr;
niter = strtol(argv[++i], &argptr, 10);
}
if (strcmp(argv[i], "-display") == 0)
{
display = 1;
}
}
}
int main(int argc, char **argv)
{
int my_rank, size, x, y, loop;
int value=0, mean=0;
unsigned matrix[X][Y];
starpu_data_handle_t data_handles[X][Y];
int ret = starpu_init(NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");
ret = starpu_mpi_init(&argc, &argv, 1);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init");
MPI_Comm_rank(MPI_COMM_WORLD, &my_rank);
MPI_Comm_size(MPI_COMM_WORLD, &size);
parse_args(argc, argv);
/* Initial data values */
for(x = 0; x < X; x++)
{
for (y = 0; y < Y; y++)
{
matrix[x][y] = (my_rank+1)*10 + value;
value++;
mean += matrix[x][y];
}
}
mean /= value;
/* Initial distribution */
for(x = 0; x < X; x++)
{
for (y = 0; y < Y; y++)
{
int mpi_rank = my_distrib(x, y, size);
if (mpi_rank == my_rank)
{
//fprintf(stderr, "[%d] Owning data[%d][%d]\n", my_rank, x, y);
starpu_variable_data_register(&data_handles[x][y], 0, (uintptr_t)&(matrix[x][y]), sizeof(unsigned));
}
else if (my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
|| my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size))
{
/* I don't own that index, but will need it for my computations */
//fprintf(stderr, "[%d] Neighbour of data[%d][%d]\n", my_rank, x, y);
starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
}
else
{
/* I know it's useless to allocate anything for this */
data_handles[x][y] = NULL;
}
if (data_handles[x][y])
{
starpu_data_set_rank(data_handles[x][y], mpi_rank);
starpu_data_set_tag(data_handles[x][y], (y*X)+x);
}
}
}
/* First computation with initial distribution */
for(loop=0 ; loop<niter; loop++)
{
for (x = 1; x < X-1; x++)
{
for (y = 1; y < Y-1; y++)
{
starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
0);
}
}
}
fprintf(stderr, "Waiting ...\n");
starpu_task_wait_for_all();
/* Now migrate data to a new distribution */
/* First register newly needed data */
for(x = 0; x < X; x++)
{
for (y = 0; y < Y; y++)
{
int mpi_rank = my_distrib2(x, y, size);
if (!data_handles[x][y] && (mpi_rank == my_rank
|| my_rank == my_distrib(x+1, y, size) || my_rank == my_distrib(x-1, y, size)
|| my_rank == my_distrib(x, y+1, size) || my_rank == my_distrib(x, y-1, size)))
{
/* Register newly-needed data */
starpu_variable_data_register(&data_handles[x][y], -1, (uintptr_t)NULL, sizeof(unsigned));
}
if (data_handles[x][y] && mpi_rank != starpu_data_get_rank(data_handles[x][y]))
{
/* Migrate the data */
starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
/* And register new rank of the matrix */
starpu_data_set_rank(data_handles[x][y], mpi_rank);
}
}
}
/* Second computation with new distribution */
for(loop=0 ; loop<niter; loop++)
{
for (x = 1; x < X-1; x++)
{
for (y = 1; y < Y-1; y++)
{
starpu_mpi_insert_task(MPI_COMM_WORLD, &stencil5_cl, STARPU_RW, data_handles[x][y],
STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
0);
}
}
}
fprintf(stderr, "Waiting ...\n");
starpu_task_wait_for_all();
/* Unregister data */
for(x = 0; x < X; x++)
{
for (y = 0; y < Y; y++)
{
if (data_handles[x][y])
{
int mpi_rank = my_distrib(x, y, size);
/* Get back data to original place where the user-provided buffer is. */
starpu_mpi_get_data_on_node_detached(MPI_COMM_WORLD, data_handles[x][y], mpi_rank, NULL, NULL);
/* Register original rank of the matrix (although useless) */
starpu_data_set_rank(data_handles[x][y], mpi_rank);
/* And unregister it */
starpu_data_unregister(data_handles[x][y]);
}
}
}
starpu_mpi_shutdown();
starpu_shutdown();
if (display)
{
fprintf(stdout, "[%d] mean=%d\n", my_rank, mean);
for(x = 0; x < X; x++)
{
fprintf(stdout, "[%d] ", my_rank);
for (y = 0; y < Y; y++)
{
fprintf(stdout, "%3u ", matrix[x][y]);
}
fprintf(stdout, "\n");
}
}
return 0;
}
@@ -400,7 +400,6 @@ whole graph of tasks, and wait for termination.
</div>
<!--
<div class="section">
<h3>Task Scheduling Policy</h3>
<p>
@@ -424,7 +423,7 @@ policies:
</p>
<tt><pre>
STARPU_BUS_STATS=1 STARPU_WORKER_STATS=1 [PATH]/examples/mult/sgemm -x 1024 -y 1024 -z 1024
STARPU_BUS_STATS=1 STARPU_WORKER_STATS=1 gemm/sgemm -x 1024 -y 1024 -z 1024
</pre></tt>
<p>
@@ -432,12 +431,12 @@ with:
</p>
<tt><pre>
STARPU_BUS_STATS=1 STARPU_WORKER_STATS=1 STARPU_SCHED=dmda [PATH]/examples/mult/sgemm -x 1024 -y 1024 -z 1024
STARPU_BUS_STATS=1 STARPU_WORKER_STATS=1 STARPU_SCHED=dmda gemm/sgemm -x 1024 -y 1024 -z 1024
</pre></tt>
<p>
There are far fewer data transfers, and StarPU realizes that there is no
point in giving tasks to GPUs, resulting in better performance.
You can see that most (if not all) of the computation has been done on GPUs,
leading to better performance.
</p>
<p>
@@ -455,13 +454,13 @@ less great performance.
</p>
</div>
-->
<!--
<div class="section">
<h3>Performance Model Calibration</h3>
<p>Performance prediction is essential for proper scheduling decisions; the
<p>
Performance prediction is essential for proper scheduling decisions; the
performance models thus have to be calibrated. This is done automatically by
StarPU when a codelet is executed for the first time. Once this is done, the
result is saved to a file in <tt>$HOME</tt> for later re-use. The
@@ -503,31 +502,51 @@ performance model thus has to be recalibrated from start. To do so, use
<div class="section">
<h2>Sessions Part 3: MPI Support</h2>
<!--
<p>StarPU provides support for MPI communications. Basically, it provides
<p>
StarPU provides support for MPI communications. Basically, it provides
equivalents of <tt>MPI_*</tt> functions that operate on DSM handles
instead of <tt>void*</tt> buffers. The difference is that the source data may
reside on a GPU where it has just been computed; StarPU automatically handles
copying it back to main memory before submitting it to MPI.
</p>
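<p>
For instance, a blocking exchange of a registered handle between two MPI
processes can be written as follows (a minimal sketch: the tag value is
arbitrary, and <tt>token_handle</tt> is assumed to have been registered as in
the ring example below):
</p>
<tt><pre>
MPI_Status status;
if (rank == 0)
    starpu_mpi_send(token_handle, 1, /* tag */ 42, MPI_COMM_WORLD);
else if (rank == 1)
    starpu_mpi_recv(token_handle, 0, /* tag */ 42, MPI_COMM_WORLD, &amp;status);
</pre></tt>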
<p><tt>mpi/tests/ring_async_implicit.c</tt> shows an example of mixing MPI communications and task submission. It is a classical MPI ring ping-pong, but the token that is passed from neighbour to neighbour is incremented by a StarPU task at each step.
<p>
<a href="files/mpi/ring_async_implicit.c"><tt>ring_async_implicit.c</tt></a>
shows an example of mixing MPI communications and task submission. It
is a classical MPI ring ping-pong, but the token that is passed from
neighbour to neighbour is incremented by a StarPU task at each
step.
</p>
<p>
This is written very naturally by simply submitting all MPI communication
requests and tasks asynchronously in a sequential-looking loop, and
eventually waiting for all the tasks to complete.
</p>
<p>This is written very naturally by simply submitting all MPI communication requests and task submission asynchronously in a sequential-looking loop, and eventually waiting for all the tasks to complete.</p>
<tt><pre>
#how many nodes and cores
#PBS -W x=NACCESSPOLICY:SINGLEJOB -q mirage -l nodes=1:ppn=12
make ring_async_implicit
mpirun -np 2 $PWD/ring_async_implicit
</pre></tt>
</div>
<div class="section">
<h3>starpu_mpi_insert_task</h3>
<p>The Cholesky factorization shown in the presentation slides is available in
<tt>mpi/examples/cholesky/mpi_cholesky.c</tt>. The data distribution over MPI
<p>
<a href="files/mpi/stencil5.c">A stencil application</a> shows a basic MPI
task model application. The data distribution over MPI
nodes is decided by the <tt>my_distrib</tt> function, and can thus be changed
trivially.</p>
trivially.
It also shows how data can be migrated to a
new distribution.
</p>
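<p>
The heart of both computation loops is then a single
<tt>starpu_mpi_insert_task</tt> call per matrix element (excerpt from the
stencil code): StarPU uses the rank attached to each data handle to decide
which node executes the task and which data transfers are needed.
</p>
<tt><pre>
starpu_mpi_insert_task(MPI_COMM_WORLD, &amp;stencil5_cl, STARPU_RW, data_handles[x][y],
                       STARPU_R, data_handles[x-1][y], STARPU_R, data_handles[x+1][y],
                       STARPU_R, data_handles[x][y-1], STARPU_R, data_handles[x][y+1],
                       0);
</pre></tt>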
</div>
-->
</div>