Data coherency and partition planning
Steps to reproduce
Build StarPU, commenting out the assert that prevents partitioning data registered with auto-allocation (home node -1) — I am not sure the assert removal is strictly required, but it is at least needed to run my reproducer. Then build this reproducer and run it:
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
* Copyright (C) 2010 Mehdi Juhoor
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
/*
* This exemplifies how to use partitioning filters. We here just split a
* vector into slices, and run a dumb kernel on them.
*/
#include <starpu.h>
#define NX 21
#define PARTS 3
#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
/* CPU codelet: fill the (sub-)vector with the sequence 0, 1, 2, ... n-1. */
static void init_cpu_func(void *buffers[], void *cl_arg)
{
	/* Number of elements in the vector interface of buffer 0. */
	int len = (int)STARPU_VECTOR_GET_NX(buffers[0]);
	/* Base pointer of the vector data in local memory. */
	int *data = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
	int idx;

	for (idx = 0; idx < len; idx++)
		data[idx] = idx;
}
/* CPU codelet: print the contents of the (sub-)vector, prefixed by the
 * string passed through cl_arg.
 *
 * Fix: the original loop iterated over NX (the full vector size) instead
 * of n (the length of the buffer actually handed to this codelet).  That
 * is harmless while the codelet only ever receives the whole vector, but
 * reads out of bounds as soon as it runs on a partitioned child. */
static void print_cpu_func(void *buffers[], void *cl_arg)
{
	int i;
	/* length of the vector interface of buffer 0 */
	int n = (int)STARPU_VECTOR_GET_NX(buffers[0]);
	/* local pointer to the vector data */
	int *vector = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
	char *prefix = (char *) cl_arg;

	FPRINTF(stderr, "%s Vector: ", prefix);
	for (i = 0; i < n; i++)
		FPRINTF(stderr, "%5d ", vector[i]);
	FPRINTF(stderr, "\n");
}
extern void vector_cpu_func(void *buffers[], void *cl_arg);
#ifdef STARPU_USE_CUDA
extern void vector_cuda_func(void *buffers[], void *cl_arg);
#endif
#ifdef STARPU_USE_HIP
extern void vector_hip_func(void *buffers[], void *cl_arg);
#endif
int main(void)
{
	int i;
	/* The data is registered with home node -1 (storage auto-allocated by
	 * StarPU), so no user buffer exists.  Initialize the pointer: the
	 * original code passed an indeterminate value to
	 * starpu_vector_data_register, which is undefined behavior even if
	 * StarPU ignores the pointer for home node -1. */
	int *vector = NULL;
	starpu_data_handle_t handle;
	int factor = 1;
	int ret;

	/* Codelet writing the initial 0..NX-1 sequence into the vector. */
	struct starpu_codelet init_cl =
	{
		.cpu_funcs = {init_cpu_func},
		.cpu_funcs_name = {"init_cpu_func"},
		.nbuffers = 1,
		.modes = {STARPU_W},
		.name = "vector_init"
	};
	/* Codelet printing the vector (read-only). */
	struct starpu_codelet print_cl =
	{
		.cpu_funcs = {print_cpu_func},
		.cpu_funcs_name = {"print_cpu_func"},
		.nbuffers = 1,
		.modes = {STARPU_R},
		.name = "vector_print"
	};
	/* Codelet scaling each sub-vector in place. */
	struct starpu_codelet cl =
	{
		.cpu_funcs = {vector_cpu_func},
		.cpu_funcs_name = {"vector_cpu_func"},
#ifdef STARPU_USE_CUDA
		.cuda_funcs = {vector_cuda_func},
		.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_HIP
		.hip_funcs = {vector_hip_func},
		.hip_flags = {STARPU_HIP_ASYNC},
#endif
		.nbuffers = 1,
		.modes = {STARPU_RW},
		.name = "vector_scal"
	};

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		exit(77);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* Declare the data to StarPU; home node -1 lets StarPU allocate it. */
	starpu_vector_data_register(&handle, -1, (uintptr_t)vector, NX, sizeof(vector[0]));

	/* Initialize the vector through a synchronous task. */
	struct starpu_task *init_task = starpu_task_create();
	init_task->handles[0] = handle;
	init_task->cl = &init_cl;
	init_task->synchronous = 1;
	ret = starpu_task_submit(init_task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	/* Print the vector before partitioning.  Passing a stack buffer as
	 * cl_arg is safe here because the task is synchronous. */
	char prefix_in[3] = "IN";
	struct starpu_task *print_task = starpu_task_create();
	print_task->handles[0] = handle;
	print_task->cl = &print_cl;
	print_task->synchronous = 1;
	print_task->cl_arg = prefix_in;
	print_task->cl_arg_size = sizeof(prefix_in);
	ret = starpu_task_submit(print_task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	/* Partition the vector in PARTS sub-vectors. */
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = PARTS
	};
	starpu_data_handle_t children[PARTS];

	/* PLAN selects the asynchronous partition-planning API (the path on
	 * which the coherency issue is observed) versus the eager
	 * starpu_data_partition API. */
#define PLAN 1
	if (PLAN)
	{
		starpu_data_partition_plan(handle, &f, children);
	}
	else
	{
		starpu_data_partition(handle, &f);
	}

	/* Submit a scaling task on each sub-vector, with a growing factor. */
	for (i = 0; i < PARTS; i++)
	{
		struct starpu_task *task = starpu_task_create();
		factor *= 10;
		if (PLAN)
		{
			task->handles[0] = children[i];
		}
		else
		{
			task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
		}
		task->cl = &cl;
		/* Synchronous submission: &factor stays valid and is re-read
		 * with its updated value for each task. */
		task->synchronous = 1;
		task->cl_arg = &factor;
		task->cl_arg_size = sizeof(factor);
		ret = starpu_task_submit(task);
		if (ret == -ENODEV) goto enodev;
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	}

	/* Tear the partitioning down before re-reading the whole vector.
	 * NOTE(review): on the PLAN path the children results are expected to
	 * be folded back into the parent handle here — the bug report is that
	 * the OUT print below does not see them. */
	if (PLAN)
	{
		starpu_data_partition_clean(handle, PARTS, children);
	}
	else
	{
		starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	}

	/* Print the vector after unpartitioning. */
	char prefix_out[4] = "OUT";
	struct starpu_task *print_task_out = starpu_task_create();
	print_task_out->handles[0] = handle;
	print_task_out->cl = &print_cl;
	print_task_out->synchronous = 1;
	print_task_out->cl_arg = prefix_out;
	print_task_out->cl_arg_size = sizeof(prefix_out);
	ret = starpu_task_submit(print_task_out);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;

enodev:
	FPRINTF(stderr, "WARNING: No one can execute this task\n");
	starpu_shutdown();
	return 77;
}
Obtained behavior
IN Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
OUT Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Expected behavior
IN Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
OUT Vector: 0 10 20 30 40 50 60 700 800 900 1000 1100 1200 1300 14000 15000 16000 17000 18000 19000 20000
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel18_cuda11.7_hpcx --disable-build-doc --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-opencl --with-mpicc=mpicc --enable-cuda --with-cuda-include-dir=/software/cuda_toolkits/cuda-11.7/include --with-cuda-lib-dir=/software/cuda_toolkits/cuda-11.7/lib64 --enable-fxt --enable-maxnumanodes=8 --enable-max-sched-ctxs=32 --disable-parallel-worker --disable-simgrid --disable-build-examples CC=icc CXX=icpc FC=ifort --no-create --no-recursion
Note that the behaviour was first observed without CUDA.
Distribution
Git master
Version of StarPU
Version of GPU drivers
Not related to GPU (same issue reproduced with CPU only)