Deadlock with filtering and a handle used twice

Steps to reproduce

Modify examples/filters/fmatrix.c to add a GEMM update on the following partitioned matrix:

[[ 0  1  2 ] | [ 3  4 ]]
[[ 5  6  7 ] | [ 8  9 ]]
[[10 11 12 ] | [13 14 ]]
[[15 16 17 ] | [18 19 ]]

Update from [15 16 17] and [[3 4][8 9][13 14]] on [18 19].

Diff of the modifications:

[blacostex@l0 (spartan) starpu]$ git diff examples/filters/
diff --git a/examples/filters/fmatrix.c b/examples/filters/fmatrix.c                                                                                                         
index c3767362f..fdf0db5e6 100644                                                                                                            
--- a/examples/filters/fmatrix.c                      
+++ b/examples/filters/fmatrix.c                                                                                                                              
@@ -28,6 +28,7 @@                                            
 #define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)

 extern void matrix_cpu_func(void *buffers[], void *cl_arg);
+extern void matrix_cpu_gemm_func(void *buffers[], void *cl_arg);

 #ifdef STARPU_USE_CUDA
 extern void matrix_cuda_func(void *buffers[], void *cl_arg);
@@ -40,6 +41,11 @@ extern void matrix_hip_func(void *buffers[], void *cl_arg);
 extern void generate_matrix_data(int *matrix, int nx, int ny, unsigned ld);
 extern void print_matrix_data(starpu_data_handle_t matrix_handle);

+struct cl_zgemm_args_s {
+       int m, n, k, offsetA, offsetB, offsetC;
+       
+};
+
 int main(void)
 {
        unsigned j;
@@ -99,7 +105,54 @@ int main(void)
                ret = starpu_task_submit(task);
                if (ret == -ENODEV) goto enodev;
                STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+               FPRINTF(stderr," Matrix i \n");
+               
+               print_matrix_data(starpu_data_get_sub_data(handle, 1, i));
+
        }
+       
+       starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+
+       FPRINTF(stderr," Matrix: before GEMM \n");
+       print_matrix_data(handle);
+       starpu_data_partition(handle, &f);
+
+       struct starpu_task *task = starpu_task_create();
+       task->handles[0] = starpu_data_get_sub_data(handle, 1, 1);
+       task->handles[1] = starpu_data_get_sub_data(handle, 1, 0);
+       struct starpu_codelet cl2 =
+       {
+               .cpu_funcs = {matrix_cpu_gemm_func},
+               .cpu_funcs_name = {"matrix_cpu_gemm_func"},
+               .nbuffers = 2,
+               .modes = {STARPU_R, STARPU_RW},
+               .name = "matrix_gemm"
+       };
+       task->cl = &cl2;
+       task->synchronous = 1;
+       int nxa = 3;
+       int nya = 4;
+       unsigned lda = 5;
+       int nxb = 2;
+       int nyb = 4;
+       unsigned ldb = 5;
+       int nxc = 2;
+       int nyc = 4;
+       unsigned ldc = 5;
+
+       struct cl_zgemm_args_s clarg = {.m = NY - nxa,
+                                       .n = NX - nxa,
+                                       .k = nxa,
+                                       .offsetA = (nxa)*lda,
+                                       .offsetB = 0,
+                                       .offsetC = (nxa)*ldc};
+
+       task->cl_arg = &clarg;
+       task->cl_arg_size = sizeof(clarg);
+
+       ret = starpu_task_submit(task);
+       if (ret == -ENODEV) goto enodev;
+       STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

        /* Unpartition the data, unregister it from StarPU and shutdown */
        starpu_data_unpartition(handle, STARPU_MAIN_RAM);
diff --git a/examples/filters/fmatrix_cpu.c b/examples/filters/fmatrix_cpu.c
index 83f432c01..eb5edae2f 100644
--- a/examples/filters/fmatrix_cpu.c
+++ b/examples/filters/fmatrix_cpu.c
@@ -37,3 +37,44 @@ void matrix_cpu_func(void *buffers[], void *cl_arg)
        }
 }

+struct cl_zgemm_args_s {
+       int m, n, k, offsetA, offsetB, offsetC;
+       
+};
+
+void matrix_cpu_gemm_func(void *buffers[], void *cl_arg)
+{
+       int i, j, k;
+       struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
+
+       /* length of the matrix */
+       int nxa = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+       int nya = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+       unsigned lda = STARPU_MATRIX_GET_LD(buffers[0]);
+       int nxb = (int)STARPU_MATRIX_GET_NX(buffers[1]);
+       int nyb = (int)STARPU_MATRIX_GET_NY(buffers[1]);
+       unsigned ldb = STARPU_MATRIX_GET_LD(buffers[1]);
+       int nxc = (int)STARPU_MATRIX_GET_NX(buffers[1]);
+       int nyc = (int)STARPU_MATRIX_GET_NY(buffers[1]);
+       unsigned ldc = STARPU_MATRIX_GET_LD(buffers[1]);
+       /* local copy of the matrix pointer */
+       int *matrixa = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+       int *matrixb = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+       int *matrixc = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+       fprintf(stderr, "GEMM m %d, n %d ,k %d, offsets(%d,%d,%d)\n", clargs->m, clargs->n, clargs->k, clargs->offsetA, clargs->offsetB, clargs->offsetC );
+       for(j=0; j<clargs->m ; j++)
+               for(i=0; i<clargs->n ; i++)
+                       for (k = 0; k < clargs->k; k++) {
+                               fprintf(stderr, "matrixc[%d] %d  += matrixa[%d] %d * matrixb[%d] %d i,j,k %d,%d,%d\n",
+                                       clargs->offsetC + (j*ldc)+i,
+                                       matrixc[clargs->offsetC + (j*ldc)+i],
+                                       clargs->offsetA + (j*lda)+k,
+                                       matrixa[clargs->offsetA + (j*lda)+k],
+                                       clargs->offsetB + (k*ldb+i),
+                                       matrixb[clargs->offsetB + (k*ldb +i)], i,j,k);
+                               matrixc[clargs->offsetC + (j*ldc)+i] += matrixa[clargs->offsetA + j*lda+k] *
+                                       matrixb[clargs->offsetB + k*ldb + i];
+                       }
+}
+

Obtained behavior

Deadlock before the GEMM update kernel is called.

Expected behavior

The GEMM update is computed (my algorithm may not be correct, but that is not the point here).

Configuration

/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel20.4.304_hpcx --enable-icc --with-mpicc=mpicc --disable-cuda --disable-opencl --enable-maxnumanodes=9 --disable-parallel-worker --disable-simgrid --disable-build-examples

Distribution

RH 8.6

Version of StarPU

git master d399d2c6

Version of GPU drivers

CPU only

To upload designs, you'll need to enable LFS and have an admin enable hashed storage. More information

Admin message