Partitioned data is fetched twice on the GPU
Steps to reproduce
Modify filters/fmatrix.c so that it performs the filtered operations on the CPU, then submits two identical GPU operations that each use the same piece of filtered data together with one other (unfiltered) data handle.
See attached diff
diff --git a/examples/filters/fmatrix.c b/examples/filters/fmatrix.c
index c3767362f..97e4d0ecb 100644
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -40,31 +40,120 @@ extern void matrix_hip_func(void *buffers[], void *cl_arg);
extern void generate_matrix_data(int *matrix, int nx, int ny, unsigned ld);
extern void print_matrix_data(starpu_data_handle_t matrix_handle);
+void matrix2_cuda_func( void * buffer[], void *cl_arg){
+}
+void starpu_matrix_filter_block2D(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+ struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+ struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
+ unsigned blocksize;
+ /* the element will be split, in case horizontal, it's nx, in case vertical, it's ny*/
+ uint32_t nn;
+ uint32_t nx;
+ uint32_t ny;
+
+ int rootSquareNChunks = 1;
+ while (rootSquareNChunks*rootSquareNChunks < nchunks) rootSquareNChunks++;
+ STARPU_ASSERT_MSG(nchunks == rootSquareNChunks*rootSquareNChunks, "Cannot split in non square number of parts");
+
+ /* actual number of elements */
+ nx = matrix_father->nx;
+ ny = matrix_father->ny;
+ blocksize = matrix_father->ld;
+
+ size_t elemsize = matrix_father->elemsize;
+
+ //STARPU_ASSERT_MSG(nchunks <= nn, "cannot split %u elements in %u parts", nn, nchunks);
+
+ uint32_t child_nx, child_ny;
+ size_t offset;
+
+ starpu_filter_nparts_compute_chunk_size_and_offset(nx, rootSquareNChunks, elemsize, id, 1, &child_nx, &offset);
+ starpu_filter_nparts_compute_chunk_size_and_offset(ny, rootSquareNChunks, elemsize, id, blocksize, &child_ny, &offset);
+
+ STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
+
+ /* update the child's interface */
+ matrix_child->id = matrix_father->id;
+
+ matrix_child->nx = child_nx;
+ matrix_child->ny = child_ny;
+ matrix_child->elemsize = elemsize;
+
+ /* is the information on this node valid ? */
+ if (matrix_father->dev_handle)
+ {
+ if (matrix_father->ptr)
+ matrix_child->ptr = matrix_father->ptr + offset;
+ matrix_child->ld = matrix_father->ld;
+ matrix_child->dev_handle = matrix_father->dev_handle;
+ matrix_child->offset = matrix_father->offset + offset;
+ matrix_child->allocsize = matrix_child->ld * matrix_child->ny * elemsize;
+ }
+ else
+ matrix_child->allocsize = matrix_child->nx * matrix_child->ny * elemsize;
+}
+
+void matrix_init_cpu_func(void *buffers[], void *cl_arg)
+{
+ int i, j;
+ int *factor = (int *) cl_arg;
+
+ /* length of the matrix */
+ int nx = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+ int ny = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+ unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+ /* local copy of the matrix pointer */
+ int *matrix = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+ generate_matrix_data(matrix, nx, ny, ld);
+}
+
+
int main(void)
{
unsigned j;
- int *matrix;
+ int *matrix, *matrix2;
int ret, i;
int factor = 12;
- starpu_data_handle_t handle;
+ starpu_data_handle_t handle, handle2;
struct starpu_codelet cl =
{
.cpu_funcs = {matrix_cpu_func},
.cpu_funcs_name = {"matrix_cpu_func"},
-#ifdef STARPU_USE_CUDA
- .cuda_funcs = {matrix_cuda_func},
- .cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-#ifdef STARPU_USE_HIP
- .hip_funcs = {matrix_hip_func},
- .hip_flags = {STARPU_HIP_ASYNC},
-#endif
+/* #ifdef STARPU_USE_CUDA */
+/* .cuda_funcs = {matrix_cuda_func}, */
+/* .cuda_flags = {STARPU_CUDA_ASYNC}, */
+/* #endif */
+/* #ifdef STARPU_USE_HIP */
+/* .hip_funcs = {matrix_hip_func}, */
+/* .hip_flags = {STARPU_HIP_ASYNC}, */
+/* #endif */
.nbuffers = 1,
.modes = {STARPU_RW},
.name = "matrix_scal"
};
+ struct starpu_codelet cl2 =
+ {
+ .cuda_funcs = {matrix2_cuda_func},
+ .cuda_flags = {STARPU_CUDA_ASYNC},
+ .nbuffers = 2,
+ .modes = {STARPU_R,STARPU_RW},
+ .name = "matrix_scal"
+ };
+
+ struct starpu_codelet cl_init =
+ {
+ .cpu_funcs = {matrix_init_cpu_func},
+ .cpu_funcs_name = {"matrix_init_cpu_func"},
+ .nbuffers = 1,
+ .modes = {STARPU_W},
+ .name = "matrix_init"
+ };
+
ret = starpu_init(NULL);
if (ret == -ENODEV)
exit(77);
@@ -72,9 +161,19 @@ int main(void)
starpu_malloc((void **)&matrix, NX*NY*sizeof(int));
generate_matrix_data(matrix, NX, NY, NX);
+ starpu_malloc((void **)&matrix2, NX*NY*sizeof(int));
+ generate_matrix_data(matrix2, NX, NY, NX);
/* Declare data to StarPU */
- starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+ starpu_matrix_data_register(&handle, STARPU_MAIN_RAM , (uintptr_t)matrix, NX, NX, NY, sizeof(int));
+ starpu_matrix_data_register(&handle2, STARPU_MAIN_RAM , (uintptr_t)matrix2, NX, NX, NY, sizeof(int));
+ struct starpu_task *task_init = starpu_task_create();
+ task_init->handles[0] = handle;//starpu_data_get_sub_data(handle, 1, i);
+ task_init->cl = &cl_init;
+ task_init->synchronous = 1;
+
+ ret = starpu_task_submit(task_init);
+
FPRINTF(stderr,"IN Matrix: \n");
print_matrix_data(handle);
@@ -84,13 +183,22 @@ int main(void)
.filter_func = starpu_matrix_filter_block,
.nchildren = PARTS
};
- starpu_data_partition(handle, &f);
+ starpu_data_handle_t children[f.nchildren];
+ starpu_data_partition_plan(handle, &f, children);
+
+ struct starpu_data_filter f2 =
+ {
+ .filter_func = starpu_matrix_filter_block2D,
+ .nchildren = PARTS*PARTS
+ };
+ starpu_data_handle_t children2[f2.nchildren];
+ starpu_data_partition_plan(handle, &f2, children2);
/* Submit a task on each sub-vector */
- for (i=0; i<starpu_data_get_nb_children(handle); i++)
+ for (i=0; i<f2.nchildren; i++)
{
struct starpu_task *task = starpu_task_create();
- task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+ task->handles[0] = children2[i];
task->cl = &cl;
task->synchronous = 1;
task->cl_arg = &factor;
@@ -101,13 +209,54 @@ int main(void)
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}
+
+ /* Submit a task on each sub-vector */
+ for (i=0; i<f.nchildren; i++)
+ {
+ struct starpu_task *task = starpu_task_create();
+ task->handles[0] = children[i];
+ task->cl = &cl;
+ task->synchronous = 1;
+ task->cl_arg = &factor;
+ task->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+ }
+
+ struct starpu_task *task = starpu_task_create();
+ task->handles[0] = children[1];
+ task->handles[1] = handle2;
+ task->cl = &cl2;
+ task->synchronous = 1;
+ task->cl_arg = &factor;
+ task->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+ struct starpu_task *task2 = starpu_task_create();
+ task2->handles[0] = children[1];
+ task2->handles[1] = handle2;
+ task2->cl = &cl2;
+ task2->synchronous = 1;
+ task2->cl_arg = &factor;
+ task2->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task2);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
/* Unpartition the data, unregister it from StarPU and shutdown */
- starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+ starpu_data_partition_clean(handle, f.nchildren, children);
+ starpu_data_partition_clean(handle, f2.nchildren, children2);
FPRINTF(stderr,"OUT Matrix: \n");
print_matrix_data(handle);
starpu_data_unregister(handle);
- starpu_free_noflag(matrix, NX*NY*sizeof(int));
+ //starpu_free_noflag(matrix, NX*NY*sizeof(int));
starpu_shutdown();
return ret;
Obtained behavior
The data is fetched twice on the GPU. See the attached trace and capture: [example.trace](/uploads/db004b57c8381c9aeb340d61a2cce95d/example.trace)
Expected behavior
The data should be fetched only once, since it is the same data for both GPU tasks.
Configuration
$ /home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel18_cuda11.7_hpcx --disable-build-doc --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-opencl --enable-icc --with-mpicc=mpicc --enable-cuda --enable-fxt --enable-maxnumanodes=9 --enable-max-sched-ctxs=32 --with-cuda-include-dir=/software/cuda_toolkits/cuda-11.7/include --with-cuda-lib-dir=/software/cuda_toolkits/cuda-11.7/lib64 --disable-parallel-worker --disable-simgrid --disable-build-examples CC=icc CXX=icpc FC=ifort --no-create --no-recursion
Distribution
RHEL 8.6
Version of StarPU
master
Version of GPU drivers
CUDA 12.