Deadlock with filtering and handle used twice
Steps to reproduce
Modify examples/filters/fmatrix.c to add a GEMM update on the partitioned matrix:
[[ 0  1  2] | [ 3  4]]
[[ 5  6  7] | [ 8  9]]
[[10 11 12] | [13 14]]
[[15 16 17] | [18 19]]
The update is computed from [15 16 17] and [[3 4][8 9][13 14]] and applied to [18 19].
Diff of the modifications:
[blacostex@l0 (spartan) starpu]$ git diff examples/filters/
diff --git a/examples/filters/fmatrix.c b/examples/filters/fmatrix.c
index c3767362f..fdf0db5e6 100644
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -28,6 +28,7 @@
#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
extern void matrix_cpu_func(void *buffers[], void *cl_arg);
+extern void matrix_cpu_gemm_func(void *buffers[], void *cl_arg);
#ifdef STARPU_USE_CUDA
extern void matrix_cuda_func(void *buffers[], void *cl_arg);
@@ -40,6 +41,11 @@ extern void matrix_hip_func(void *buffers[], void *cl_arg);
extern void generate_matrix_data(int *matrix, int nx, int ny, unsigned ld);
extern void print_matrix_data(starpu_data_handle_t matrix_handle);
+struct cl_zgemm_args_s {
+ int m, n, k, offsetA, offsetB, offsetC;
+
+};
+
int main(void)
{
unsigned j;
@@ -99,7 +105,54 @@ int main(void)
ret = starpu_task_submit(task);
if (ret == -ENODEV) goto enodev;
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+ FPRINTF(stderr," Matrix i \n");
+
+ print_matrix_data(starpu_data_get_sub_data(handle, 1, i));
+
}
+
+ starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+
+ FPRINTF(stderr," Matrix: before GEMM \n");
+ print_matrix_data(handle);
+ starpu_data_partition(handle, &f);
+
+ struct starpu_task *task = starpu_task_create();
+ task->handles[0] = starpu_data_get_sub_data(handle, 1, 1);
+ task->handles[1] = starpu_data_get_sub_data(handle, 1, 0);
+ struct starpu_codelet cl2 =
+ {
+ .cpu_funcs = {matrix_cpu_gemm_func},
+ .cpu_funcs_name = {"matrix_cpu_gemm_func"},
+ .nbuffers = 2,
+ .modes = {STARPU_R, STARPU_RW},
+ .name = "matrix_gemm"
+ };
+ task->cl = &cl2;
+ task->synchronous = 1;
+ int nxa = 3;
+ int nya = 4;
+ unsigned lda = 5;
+ int nxb = 2;
+ int nyb = 4;
+ unsigned ldb = 5;
+ int nxc = 2;
+ int nyc = 4;
+ unsigned ldc = 5;
+
+ struct cl_zgemm_args_s clarg = {.m = NY - nxa,
+ .n = NX - nxa,
+ .k = nxa,
+ .offsetA = (nxa)*lda,
+ .offsetB = 0,
+ .offsetC = (nxa)*ldc};
+
+ task->cl_arg = &clarg;
+ task->cl_arg_size = sizeof(clarg);
+
+ ret = starpu_task_submit(task);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
/* Unpartition the data, unregister it from StarPU and shutdown */
starpu_data_unpartition(handle, STARPU_MAIN_RAM);
diff --git a/examples/filters/fmatrix_cpu.c b/examples/filters/fmatrix_cpu.c
index 83f432c01..eb5edae2f 100644
--- a/examples/filters/fmatrix_cpu.c
+++ b/examples/filters/fmatrix_cpu.c
@@ -37,3 +37,44 @@ void matrix_cpu_func(void *buffers[], void *cl_arg)
}
}
+struct cl_zgemm_args_s {
+ int m, n, k, offsetA, offsetB, offsetC;
+
+};
+
+void matrix_cpu_gemm_func(void *buffers[], void *cl_arg)
+{
+ int i, j, k;
+ struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
+
+ /* length of the matrix */
+ int nxa = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+ int nya = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+ unsigned lda = STARPU_MATRIX_GET_LD(buffers[0]);
+ int nxb = (int)STARPU_MATRIX_GET_NX(buffers[1]);
+ int nyb = (int)STARPU_MATRIX_GET_NY(buffers[1]);
+ unsigned ldb = STARPU_MATRIX_GET_LD(buffers[1]);
+ int nxc = (int)STARPU_MATRIX_GET_NX(buffers[1]);
+ int nyc = (int)STARPU_MATRIX_GET_NY(buffers[1]);
+ unsigned ldc = STARPU_MATRIX_GET_LD(buffers[1]);
+ /* local copy of the matrix pointer */
+ int *matrixa = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+ int *matrixb = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+ int *matrixc = (int *)STARPU_MATRIX_GET_PTR(buffers[1]);
+
+ fprintf(stderr, "GEMM m %d, n %d ,k %d, offsets(%d,%d,%d)\n", clargs->m, clargs->n, clargs->k, clargs->offsetA, clargs->offsetB, clargs->offsetC );
+ for(j=0; j<clargs->m ; j++)
+ for(i=0; i<clargs->n ; i++)
+ for (k = 0; k < clargs->k; k++) {
+ fprintf(stderr, "matrixc[%d] %d += matrixa[%d] %d * matrixb[%d] %d i,j,k %d,%d,%d\n",
+ clargs->offsetC + (j*ldc)+i,
+ matrixc[clargs->offsetC + (j*ldc)+i],
+ clargs->offsetA + (j*lda)+k,
+ matrixa[clargs->offsetA + (j*lda)+k],
+ clargs->offsetB + (k*ldb+i),
+ matrixb[clargs->offsetB + (k*ldb +i)], i,j,k);
+ matrixc[clargs->offsetC + (j*ldc)+i] += matrixa[clargs->offsetA + j*lda+k] *
+ matrixb[clargs->offsetB + k*ldb + i];
+ }
+}
+
Obtained behavior
Deadlock before the GEMM update kernel is called.
Expected behavior
The GEMM is computed (my algorithm may not be correct, but that is not the point here).
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel20.4.304_hpcx --enable-icc --with-mpicc=mpicc --disable-cuda --disable-opencl --enable-maxnumanodes=9 --disable-parallel-worker --disable-simgrid --disable-build-examples
Distribution
RH 8.6
Version of StarPU
git master d399d2c6
Version of GPU drivers
CPU only