Bad data exchange with MPI + data filtering
Steps to reproduce
I modified mpi/examples/filters.c so that it takes 2 vectors as parameters (I changed the operation to be v1 += factor*v2 and, conversely, v2 += factor*v1). V1 is allocated on proc 0 and v2 on proc 1. Half of the computation happens on proc 0, the other half on proc 1. At the end, the data is correct for the first half of V1 and wrong otherwise. Here is the patch.
diff --git a/mpi/examples/filters/filter.c b/mpi/examples/filters/filter.c
index 4a8c0caf6..08a7c50b9 100644
--- a/mpi/examples/filters/filter.c
+++ b/mpi/examples/filters/filter.c
@@ -33,19 +33,25 @@ void cpu_func(void *buffers[], void *cl_arg)
starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
fprintf(stderr, "computing on rank %d\n", rank);
unsigned n = STARPU_VECTOR_GET_NX(buffers[0]);
- int *val = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+ int *val1 = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
+ int *val2 = (int *)STARPU_VECTOR_GET_PTR(buffers[1]);
starpu_codelet_unpack_args(cl_arg, &factor);
- for (i = 0; i < n; i++)
- val[i] *= factor;
+ for (i = 0; i < n; i++) {
+ int v1 =val1[i];
+ int v2 =val2[i];
+ fprintf(stderr, "v1 %d v2 %d\n", v1, v2);
+ val1[i] += factor*v2;
+ val2[i] += factor*v1;
+ }
}
struct starpu_codelet cl =
{
.cpu_funcs = {cpu_func},
.cpu_funcs_name = {"cpu_func"},
- .nbuffers = 1,
- .modes = {STARPU_RW},
+ .nbuffers = 2,
+ .modes = {STARPU_RW, STARPU_RW},
.name = "vector_scal"
};
@@ -78,10 +84,10 @@ void vector_filter(void *father_interface, void *child_interface, struct starpu_
int main(int argc, char **argv)
{
int i, rank, nodes;
- int vector[NX];
- int vector_check[NX];
- starpu_data_handle_t vhandle;
- starpu_data_handle_t handles[2];
+ int vector[2][NX];
+ int vector_check[2][NX];
+ starpu_data_handle_t vhandle[2];
+ starpu_data_handle_t handles[4];
int factor[2] = {2, 3};
int ret;
@@ -105,21 +111,33 @@ int main(int argc, char **argv)
for(i=0 ; i<NX ; i++)
{
- vector[i] = i+1;
- if (i < NX/2)
- vector_check[i] = vector[i]*factor[0];
- else
- vector_check[i] = vector[i]*factor[1];
+ vector[0][i] = i+1;
+ vector[1][i] = NX+i+1;
+ if (i < NX/2) {
+ vector_check[0][i] = vector[0][i]+vector[1][i]*factor[0];
+ vector_check[1][i] = vector[1][i]+vector[0][i]*factor[0];
+ } else {
+ vector_check[0][i] = vector[0][i]+vector[1][i]*factor[1];
+ vector_check[1][i] = vector[1][i]+vector[0][i]*factor[1];
+ }
+ }
+ for (int j = 0; j < 2; j++) {
+ FPRINTF(stderr,"IN Vector[%d]: ", j);
+ for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[j][i]);
+ FPRINTF(stderr,"\n");
}
- FPRINTF(stderr,"IN Vector: ");
- for(i=0 ; i<NX ; i++) FPRINTF(stderr, "%5d ", vector[i]);
- FPRINTF(stderr,"\n");
/* Declare data to StarPU */
- if (rank == 0)
- starpu_vector_data_register(&vhandle, STARPU_MAIN_RAM, (uintptr_t)vector, NX, sizeof(vector[0]));
- else
- starpu_vector_data_register(&vhandle, -1, (uintptr_t)NULL, NX, sizeof(vector[0]));
+ if (rank == 0) {
+ starpu_vector_data_register(vhandle, STARPU_MAIN_RAM, (uintptr_t)vector[0], NX, sizeof(vector[0]));
+ starpu_vector_data_register(vhandle+1, -1, (uintptr_t)NULL, NX, sizeof(vector[1]));
+ } else {
+ starpu_vector_data_register(vhandle, -1, (uintptr_t)NULL, NX, sizeof(vector[0]));
+ starpu_vector_data_register(vhandle+1, STARPU_MAIN_RAM, (uintptr_t)vector[1], NX, sizeof(vector[1]));
+ }
+ for (int j = 0; j < 2; j++) {
+ starpu_mpi_data_register(vhandle[j], 66+j, j);
+ }
/* Partition the vector in PARTS sub-vectors */
struct starpu_data_filter f =
@@ -127,44 +145,52 @@ int main(int argc, char **argv)
.filter_func = vector_filter,
.nchildren = 2
};
- starpu_data_partition_plan(vhandle, &f, handles);
- starpu_data_partition_submit(vhandle, 2, handles);
+ starpu_data_partition_plan(vhandle[0], &f, handles);
+ starpu_data_partition_submit(vhandle[0], 2, handles);
+ starpu_data_partition_plan(vhandle[1], &f, handles+2);
+ starpu_data_partition_submit(vhandle[1], 2, handles+2);
/* Submit a task on each sub-vector */
for (i=0; i<2; i++)
{
starpu_mpi_data_register(handles[i], 42+i, 0);
+ starpu_mpi_data_register(handles[i+2], 42+i+2, 1);
ret = starpu_mpi_task_insert(MPI_COMM_WORLD,
&cl,
STARPU_RW, handles[i],
+ STARPU_RW, handles[i+2],
STARPU_VALUE, &factor[i], sizeof(factor[i]),
- STARPU_EXECUTE_ON_NODE, 1,
+ STARPU_EXECUTE_ON_NODE, i,
0);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}
- starpu_data_unpartition_submit(vhandle, 2, handles, -1);
- starpu_data_partition_clean(vhandle, 2, handles);
+ starpu_data_unpartition_submit(vhandle[0], 2, handles, -1);
+ starpu_data_partition_clean(vhandle[0], 2, handles);
+ starpu_data_unpartition_submit(vhandle[1], 2, handles+2, -1);
+ starpu_data_partition_clean(vhandle[1], 2, handles+2);
int ok=0;
if (rank == 0)
{
- starpu_data_acquire(vhandle, STARPU_R);
- int *v = starpu_data_get_local_ptr(vhandle);
- FPRINTF(stderr,"OUT Vector: ");
- for(i=0 ; i<NX ; i++)
- {
- FPRINTF(stderr, "%5d ", v[i]);
- if (v[i] != vector_check[i])
+ for (int j = 0; j < 2; j++) {
+ starpu_data_acquire(vhandle[j], STARPU_R);
+ int *v = starpu_data_get_local_ptr(vhandle[j]);
+ FPRINTF(stderr,"OUT Vector[%d]: ", j);
+ for(i=0 ; i<NX ; i++)
{
- FPRINTF(stderr, "%5d should be %5d\n", v[i], vector_check[i]);
- ok=1;
+ FPRINTF(stderr, "%5d ", v[i]);
+ if (v[i] != vector_check[j][i])
+ {
+ FPRINTF(stderr, "%5d should be %5d\n", v[i], vector_check[j][i]);
+ ok=1;
+ }
}
+ FPRINTF(stderr,"\n");
+ starpu_data_release(vhandle[j]);
}
- FPRINTF(stderr,"\n");
- starpu_data_release(vhandle);
}
-
- starpu_data_unregister(vhandle);
+ starpu_data_unregister(vhandle[0]);
+ starpu_data_unregister(vhandle[1]);
starpu_mpi_shutdown();
return ok;
Obtained behavior
The check fails on the 2nd half of vector[0] and on all of vector[1].
Expected behavior
All checks pass.
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel20.4.304_hpcx --enable-icc --with-mpicc=mpicc --disable-cuda --disable-opencl --enable-maxnumanodes=9 --disable-parallel-worker --disable-simgrid --disable-build-examples
Distribution
RHEL 8.6
Version of StarPU
git d399d2c6
Version of GPU drivers
No GPU