Bad data exchange with MPI + data filtering
Steps to reproduce
I modified mpi/examples/filters.c so that it takes 2 vectors as parameters (I changed the operation to be v1 += factor*v2 and, conversely, v2 += factor*v1). V1 is allocated on proc 0 and v2 on proc 1. Half of the computation happens on proc 0, the other half on proc 1. At the end, the data is correct for the first half of V1 and wrong otherwise. Here is the patch.
diff --git a/mpi/examples/filters/filter.c b/mpi/examples/filters/filter.c
index 4a8c0caf6..08a7c50b9 100644
--- a/mpi/examples/filters/filter.c
+++ b/mpi/examples/filters/filter.c
@@ -33,19 +33,25 @@ void cpu_func(void *buffers[], void *cl_arg)
starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr, "computing on rank %d\n", rank);
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
- int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+ int *val1 = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+ int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
starpu_codelet_unpack_args(cl_arg, &factor);
- for (i = 0; i < n; i++)
- val[i] *= factor;
+ for (i = 0; i < n; i++) {
+ int v1 =val1[i];
+ int v2 =val2[i];
+ fprintf(stderr, "v1 %d v2 %d\n", v1, v2);
+ val1[i] += factor*v2;
+ val2[i] += factor*v1;
+ }
}
struct starpu_codelet cl =
{
.cpu_funcs = {cpu_func},
.cpu_funcs_name = {"cpu_func"},
- .nbuffers = 1,
- .modes = {STARPU_RW},
+ .nbuffers = 2,
+ .modes = {STARPU_RW, STARPU_RW},
.name = "vector_scal"
};
@@ -78,10 +84,10 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
int main(int argc, char **argv)
{
int i, rank, nodes;
- int vector[NX];
- int vector_check[NX];
- starpu_data_handle_t vhandle;
- starpu_data_handle_t handles[2];
+ int vector[2][NX];
+ int vector_check[2][NX];
+ starpu_data_handle_t vhandle[2];
+ starpu_data_handle_t handles[4];
int factor[2] = {2, 3};
int ret;
@@ -105,21 +111,33 @@ int main(int argc, char **argv)
for(i=0 ; i<NX ; i++)
{
- vector[i] = i+1;
- if (i < NX/2)
- vector_check[i] = vector[i]*factor[0];
- else
- vector_check[i] = vector[i]*factor[1];
+ vector[0][i] = i+1;
+ vector[1][i] = NX+i+1;
+ if (i < NX/2) {
+ vector_check[0][i] = vector[0][i]+vector[1][i]*factor[0];
+ vector_check[1][i] = vector[1][i]+vector[0][i]*factor[0];
+ } else {
+ vector_check[0][i] = vector[0][i]+vector[1][i]*factor[1];
+ vector_check[1][i] = vector[1][i]+vector[0][i]*factor[1];
+ }
+ }
+ for (int j = 0; j < 2; j++) {
+ FPRINTF(stderr,"IN Vector[%d]: ", j);
+ for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[j][i]);
+ FPRINTF(stderr,"\n");
}
- FPRINTF(stderr,"IN Vector: ");
- for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
- FPRINTF(stderr,"\n");
/* Declare data to StarPU */
- if (rank == 0)
- starpu_vector_data_register(&vhandle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
- else
- starpu_vector_data_register(&vhandle, -1, (uintptr_t)NULL, NX, sizeof(vector[0]));
+ if (rank == 0) {
+ starpu_vector_data_register(vhandle, STARPU_MAIN_RAM, (uintptr_t)vector[0], NX, sizeof(vector[0]));
+ starpu_vector_data_register(vhandle+1, -1, (uintptr_t)NULL, NX, sizeof(vector[1]));
+ } else {
+ starpu_vector_data_register(vhandle, -1, (uintptr_t)NULL, NX, sizeof(vector[0]));
+ starpu_vector_data_register(vhandle+1, STARPU_MAIN_RAM, (uintptr_t)vector[1], NX, sizeof(vector[1]));
+ }
+ for (int j = 0; j < 2; j++) {
+ starpu_mpi_data_register(vhandle[j], 66+j, j);
+ }
/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
@@ -127,44 +145,52 @@ int main(int argc, char **argv)
.filter_func = vector_filter,
.nchildren = 2
};
- starpu_data_partition_plan(vhandle, &f, handles);
- starpu_data_partition_submit(vhandle, 2, handles);
+ starpu_data_partition_plan(vhandle[0], &f, handles);
+ starpu_data_partition_submit(vhandle[0], 2, handles);
+ starpu_data_partition_plan(vhandle[1], &f, handles+2);
+ starpu_data_partition_submit(vhandle[1], 2, handles+2);
/* Submit a task on each sub-vector */
for (i=0; i<2; i++)
{
starpu_mpi_data_register(handles[i], 42+i, 0);
+ starpu_mpi_data_register(handles[i+2], 42+i+2, 1);
ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
&cl,
STARPU_RW, handles[i],
+ STARPU_RW, handles[i+2],
STARPU_VALUE, &factor[i], sizeof(factor[i]),
- STARPU_EXECUTE_ON_NODE, 1,
+ STARPU_EXECUTE_ON_NODE, i,
0);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}
- starpu_data_unpartition_submit(vhandle, 2, handles, -1);
- starpu_data_partition_clean(vhandle, 2, handles);
+ starpu_data_unpartition_submit(vhandle[0], 2, handles, -1);
+ starpu_data_partition_clean(vhandle[0], 2, handles);
+ starpu_data_unpartition_submit(vhandle[1], 2, handles+2, -1);
+ starpu_data_partition_clean(vhandle[1], 2, handles+2);
int ok=0;
if (rank == 0)
{
- starpu_data_acquire(vhandle, STARPU_R);
- int *v = starpu_data_get_local_ptr(vhandle);
- FPRINTF(stderr,"OUT Vector: ");
- for(i=0 ; i<NX ; i++)
- {
- FPRINTF(stderr, "%5d ", v[i]);
- if (v[i] != vector_check[i])
+ for (int j = 0; j < 2; j++) {
+ starpu_data_acquire(vhandle[j], STARPU_R);
+ int *v = starpu_data_get_local_ptr(vhandle[j]);
+ FPRINTF(stderr,"OUT Vector[%d]: ", j);
+ for(i=0 ; i<NX ; i++)
{
- FPRINTF(stderr, "%5d should be %5d\n", v[i], vector_check[i]);
- ok=1;
+ FPRINTF(stderr, "%5d ", v[i]);
+ if (v[i] != vector_check[j][i])
+ {
+ FPRINTF(stderr, "%5d should be %5d\n", v[i], vector_check[j][i]);
+ ok=1;
+ }
}
+ FPRINTF(stderr,"\n");
+ starpu_data_release(vhandle[j]);
}
- FPRINTF(stderr,"\n");
- starpu_data_release(vhandle);
}
-
- starpu_data_unregister(vhandle);
+ starpu_data_unregister(vhandle[0]);
+ starpu_data_unregister(vhandle[1]);
starpu_mpi_shutdown();
return ok;
Obtained behavior
The check fails on the 2nd half of vector[0] and on all of vector[1].
Expected behavior
All checks pass.
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel20.4.304_hpcx --enable-icc --with-mpicc=mpicc --disable-cuda --disable-opencl --enable-maxnumanodes=9 --disable-parallel-worker --disable-simgrid --disable-build-examples
Distribution
RHEL 8.6
Version of StarPU
git d399d2c6
Version of GPU drivers
No GPU