Partitioned data is fetched twice on the GPU
Steps to reproduce
Modify filters/fmatrix.c so that it performs the filtered operations on the CPU, then submits two identical GPU operations that each use the same piece of filtered data together with one other (unfiltered) data handle.
See attached diff
diff --git a/examples/filters/fmatrix.c b/examples/filters/fmatrix.c
index c3767362f..97e4d0ecb 100644
--- a/examples/filters/fmatrix.c
+++ b/examples/filters/fmatrix.c
@@ -40,31 +40,120 @@ extern void matrix_hip_func(void *buffers[], void *cl_arg);
extern void generate_matrix_data(int *matrix, int nx, int ny, unsigned ld);
extern void print_matrix_data(starpu_data_handle_t matrix_handle);
+void matrix2_cuda_func( void * buffer[], void *cl_arg){
+}
+void starpu_matrix_filter_block2D(void *father_interface, void *child_interface, STARPU_ATTRIBUTE_UNUSED struct starpu_data_filter *f, unsigned id, unsigned nchunks)
+{
+ struct starpu_matrix_interface *matrix_father = (struct starpu_matrix_interface *) father_interface;
+ struct starpu_matrix_interface *matrix_child = (struct starpu_matrix_interface *) child_interface;
+
+ unsigned blocksize;
+ /* the element will be split, in case horizontal, it's nx, in case vertical, it's ny*/
+ uint32_t nn;
+ uint32_t nx;
+ uint32_t ny;
+
+ int rootSquareNChunks = 1;
+ while (rootSquareNChunks*rootSquareNChunks < nchunks) rootSquareNChunks++;
+ STARPU_ASSERT_MSG(nchunks == rootSquareNChunks*rootSquareNChunks, "Cannot split in non square number of parts");
+
+ /* actual number of elements */
+ nx = matrix_father->nx;
+ ny = matrix_father->ny;
+ blocksize = matrix_father->ld;
+
+ size_t elemsize = matrix_father->elemsize;
+
+ //STARPU_ASSERT_MSG(nchunks <= nn, "cannot split %u elements in %u parts", nn, nchunks);
+
+ uint32_t child_nx, child_ny;
+ size_t offset;
+
+ starpu_filter_nparts_compute_chunk_size_and_offset(nx, rootSquareNChunks, elemsize, id, 1, &child_nx, &offset);
+ starpu_filter_nparts_compute_chunk_size_and_offset(ny, rootSquareNChunks, elemsize, id, blocksize, &child_ny, &offset);
+
+ STARPU_ASSERT_MSG(matrix_father->id == STARPU_MATRIX_INTERFACE_ID, "%s can only be applied on a matrix data", __func__);
+
+ /* update the child's interface */
+ matrix_child->id = matrix_father->id;
+
+ matrix_child->nx = child_nx;
+ matrix_child->ny = child_ny;
+ matrix_child->elemsize = elemsize;
+
+ /* is the information on this node valid ? */
+ if (matrix_father->dev_handle)
+ {
+ if (matrix_father->ptr)
+ matrix_child->ptr = matrix_father->ptr + offset;
+ matrix_child->ld = matrix_father->ld;
+ matrix_child->dev_handle = matrix_father->dev_handle;
+ matrix_child->offset = matrix_father->offset + offset;
+ matrix_child->allocsize = matrix_child->ld * matrix_child->ny * elemsize;
+ }
+ else
+ matrix_child->allocsize = matrix_child->nx * matrix_child->ny * elemsize;
+}
+
+void matrix_init_cpu_func(void *buffers[], void *cl_arg)
+{
+ int i, j;
+ int *factor = (int *) cl_arg;
+
+ /* length of the matrix */
+ int nx = (int)STARPU_MATRIX_GET_NX(buffers[0]);
+ int ny = (int)STARPU_MATRIX_GET_NY(buffers[0]);
+ unsigned ld = STARPU_MATRIX_GET_LD(buffers[0]);
+ /* local copy of the matrix pointer */
+ int *matrix = (int *)STARPU_MATRIX_GET_PTR(buffers[0]);
+
+ generate_matrix_data(matrix, nx, ny, ld);
+}
+
+
int main(void)
{
unsigned j;
- int *matrix;
+ int *matrix, *matrix2;
int ret, i;
int factor = 12;
- starpu_data_handle_t handle;
+ starpu_data_handle_t handle, handle2;
struct starpu_codelet cl =
{
.cpu_funcs = {matrix_cpu_func},
.cpu_funcs_name = {"matrix_cpu_func"},
-#ifdef STARPU_USE_CUDA
- .cuda_funcs = {matrix_cuda_func},
- .cuda_flags = {STARPU_CUDA_ASYNC},
-#endif
-#ifdef STARPU_USE_HIP
- .hip_funcs = {matrix_hip_func},
- .hip_flags = {STARPU_HIP_ASYNC},
-#endif
+/* #ifdef STARPU_USE_CUDA */
+/* .cuda_funcs = {matrix_cuda_func}, */
+/* .cuda_flags = {STARPU_CUDA_ASYNC}, */
+/* #endif */
+/* #ifdef STARPU_USE_HIP */
+/* .hip_funcs = {matrix_hip_func}, */
+/* .hip_flags = {STARPU_HIP_ASYNC}, */
+/* #endif */
.nbuffers = 1,
.modes = {STARPU_RW},
.name = "matrix_scal"
};
+ struct starpu_codelet cl2 =
+ {
+ .cuda_funcs = {matrix2_cuda_func},
+ .cuda_flags = {STARPU_CUDA_ASYNC},
+ .nbuffers = 2,
+ .modes = {STARPU_R,STARPU_RW},
+ .name = "matrix_scal"
+ };
+
+ struct starpu_codelet cl_init =
+ {
+ .cpu_funcs = {matrix_init_cpu_func},
+ .cpu_funcs_name = {"matrix_init_cpu_func"},
+ .nbuffers = 1,
+ .modes = {STARPU_W},
+ .name = "matrix_init"
+ };
+
ret = starpu_init(NULL);
if (ret == -ENODEV)
exit(77);
@@ -72,9 +161,19 @@ int main(void)
starpu_malloc((void **)&matrix, NX*NY*sizeof(int));
generate_matrix_data(matrix, NX, NY, NX);
+ starpu_malloc((void **)&matrix2, NX*NY*sizeof(int));
+ generate_matrix_data(matrix2, NX, NY, NX);
/* Declare data to StarPU */
- starpu_matrix_data_register(&handle, STARPU_MAIN_RAM, (uintptr_t)matrix, NX, NX, NY, sizeof(matrix[0]));
+ starpu_matrix_data_register(&handle, STARPU_MAIN_RAM , (uintptr_t)matrix, NX, NX, NY, sizeof(int));
+ starpu_matrix_data_register(&handle2, STARPU_MAIN_RAM , (uintptr_t)matrix2, NX, NX, NY, sizeof(int));
+ struct starpu_task *task_init = starpu_task_create();
+ task_init->handles[0] = handle;//starpu_data_get_sub_data(handle, 1, i);
+ task_init->cl = &cl_init;
+ task_init->synchronous = 1;
+
+ ret = starpu_task_submit(task_init);
+
FPRINTF(stderr,"IN Matrix: \n");
print_matrix_data(handle);
@@ -84,13 +183,22 @@ int main(void)
.filter_func = starpu_matrix_filter_block,
.nchildren = PARTS
};
- starpu_data_partition(handle, &f);
+ starpu_data_handle_t children[f.nchildren];
+ starpu_data_partition_plan(handle, &f, children);
+
+ struct starpu_data_filter f2 =
+ {
+ .filter_func = starpu_matrix_filter_block2D,
+ .nchildren = PARTS*PARTS
+ };
+ starpu_data_handle_t children2[f2.nchildren];
+ starpu_data_partition_plan(handle, &f2, children2);
/* Submit a task on each sub-vector */
- for (i=0; i<starpu_data_get_nb_children(handle); i++)
+ for (i=0; i<f2.nchildren; i++)
{
struct starpu_task *task = starpu_task_create();
- task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
+ task->handles[0] = children2[i];
task->cl = &cl;
task->synchronous = 1;
task->cl_arg = &factor;
@@ -101,13 +209,54 @@ int main(void)
STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
}
+
+ /* Submit a task on each sub-vector */
+ for (i=0; i<f.nchildren; i++)
+ {
+ struct starpu_task *task = starpu_task_create();
+ task->handles[0] = children[i];
+ task->cl = &cl;
+ task->synchronous = 1;
+ task->cl_arg = &factor;
+ task->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+ }
+
+ struct starpu_task *task = starpu_task_create();
+ task->handles[0] = children[1];
+ task->handles[1] = handle2;
+ task->cl = &cl2;
+ task->synchronous = 1;
+ task->cl_arg = &factor;
+ task->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
+ struct starpu_task *task2 = starpu_task_create();
+ task2->handles[0] = children[1];
+ task2->handles[1] = handle2;
+ task2->cl = &cl2;
+ task2->synchronous = 1;
+ task2->cl_arg = &factor;
+ task2->cl_arg_size = sizeof(factor);
+
+ ret = starpu_task_submit(task2);
+ if (ret == -ENODEV) goto enodev;
+ STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
+
/* Unpartition the data, unregister it from StarPU and shutdown */
- starpu_data_unpartition(handle, STARPU_MAIN_RAM);
+ starpu_data_partition_clean(handle, f.nchildren, children);
+ starpu_data_partition_clean(handle, f2.nchildren, children2);
FPRINTF(stderr,"OUT Matrix: \n");
print_matrix_data(handle);
starpu_data_unregister(handle);
- starpu_free_noflag(matrix, NX*NY*sizeof(int));
+ //starpu_free_noflag(matrix, NX*NY*sizeof(int));
starpu_shutdown();
return ret;
Obtained behavior
The data is fetched twice on the GPU. See the attached trace and capture: [example.trace](/uploads/db004b57c8381c9aeb340d61a2cce95d/example.trace)
Expected behavior
The data should be fetched only once, since it is the same data for both GPU tasks.
Configuration
$ /home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel18_cuda11.7_hpcx --disable-build-doc --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-opencl --enable-icc --with-mpicc=mpicc --enable-cuda --enable-fxt --enable-maxnumanodes=9 --enable-max-sched-ctxs=32 --with-cuda-include-dir=/software/cuda_toolkits/cuda-11.7/include --with-cuda-lib-dir=/software/cuda_toolkits/cuda-11.7/lib64 --disable-parallel-worker --disable-simgrid --disable-build-examples CC=icc CXX=icpc FC=ifort --no-create --no-recursion
Distribution
RHEL 8.6
Version of StarPU
master
Version of GPU drivers
CUDA 12.