Data coherency and partition planning
Steps to reproduce
Build StarPU, commenting out the assert that prevents partitioning data registered with auto-allocation (home node -1) — I am not sure the assert removal is strictly required, but it is at least needed to run my reproducer. Then build this reproducer and run it:
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2010-2023 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
* Copyright (C) 2010 Mehdi Juhoor
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
/*
* This exemplifies how to use partitioning filters. We here just split a
* vector into slices, and run a dumb kernel on them.
*/
#include <starpu.h>
#define NX 21
#define PARTS 3
#define FPRINTF(ofile, fmt, ...) do { if (!getenv("STARPU_SSILENT")) {fprintf(ofile, fmt, ## __VA_ARGS__); }} while(0)
/* CPU codelet: fill the (sub-)vector with the sequence 0, 1, 2, ... n-1. */
static void init_cpu_func(void *buffers[], void *cl_arg)
{
	/* Number of elements in the vector interface of buffer 0. */
	int len = (int)STARPU_VECTOR_GET_NX(buffers[0]);
	/* Base pointer of the vector data in local memory. */
	int *data = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
	int idx;

	for (idx = 0; idx < len; idx++)
		data[idx] = idx;
}
/* CPU codelet: print the contents of the (sub-)vector, prefixed by the
 * string passed through cl_arg.
 *
 * Fix: the original loop iterated over NX (the full vector size) instead
 * of n (the length of the buffer actually handed to this codelet).  That
 * is harmless while the codelet only ever receives the whole vector, but
 * reads out of bounds as soon as it runs on a partitioned child. */
static void print_cpu_func(void *buffers[], void *cl_arg)
{
	int i;
	/* length of the vector interface of buffer 0 */
	int n = (int)STARPU_VECTOR_GET_NX(buffers[0]);
	/* local pointer to the vector data */
	int *vector = (int *)STARPU_VECTOR_GET_PTR(buffers[0]);
	char *prefix = (char *) cl_arg;

	FPRINTF(stderr, "%s Vector: ", prefix);
	for (i = 0; i < n; i++)
		FPRINTF(stderr, "%5d ", vector[i]);
	FPRINTF(stderr, "\n");
}
extern void vector_cpu_func(void *buffers[], void *cl_arg);
#ifdef STARPU_USE_CUDA
extern void vector_cuda_func(void *buffers[], void *cl_arg);
#endif
#ifdef STARPU_USE_HIP
extern void vector_hip_func(void *buffers[], void *cl_arg);
#endif
int main(void)
{
	int i;
	/* The data is registered with home node -1 (storage auto-allocated by
	 * StarPU), so no user buffer exists.  Initialize the pointer: the
	 * original code passed an indeterminate value to
	 * starpu_vector_data_register, which is undefined behavior even if
	 * StarPU ignores the pointer for home node -1. */
	int *vector = NULL;
	starpu_data_handle_t handle;
	int factor = 1;
	int ret;

	/* Codelet writing the initial 0..NX-1 sequence into the vector. */
	struct starpu_codelet init_cl =
	{
		.cpu_funcs = {init_cpu_func},
		.cpu_funcs_name = {"init_cpu_func"},
		.nbuffers = 1,
		.modes = {STARPU_W},
		.name = "vector_init"
	};
	/* Codelet printing the vector (read-only). */
	struct starpu_codelet print_cl =
	{
		.cpu_funcs = {print_cpu_func},
		.cpu_funcs_name = {"print_cpu_func"},
		.nbuffers = 1,
		.modes = {STARPU_R},
		.name = "vector_print"
	};
	/* Codelet scaling each sub-vector in place. */
	struct starpu_codelet cl =
	{
		.cpu_funcs = {vector_cpu_func},
		.cpu_funcs_name = {"vector_cpu_func"},
#ifdef STARPU_USE_CUDA
		.cuda_funcs = {vector_cuda_func},
		.cuda_flags = {STARPU_CUDA_ASYNC},
#endif
#ifdef STARPU_USE_HIP
		.hip_funcs = {vector_hip_func},
		.hip_flags = {STARPU_HIP_ASYNC},
#endif
		.nbuffers = 1,
		.modes = {STARPU_RW},
		.name = "vector_scal"
	};

	ret = starpu_init(NULL);
	if (ret == -ENODEV)
		exit(77);
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_init");

	/* Declare the data to StarPU; home node -1 lets StarPU allocate it. */
	starpu_vector_data_register(&handle, -1, (uintptr_t)vector, NX, sizeof(vector[0]));

	/* Initialize the vector through a synchronous task. */
	struct starpu_task *init_task = starpu_task_create();
	init_task->handles[0] = handle;
	init_task->cl = &init_cl;
	init_task->synchronous = 1;
	ret = starpu_task_submit(init_task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	/* Print the vector before partitioning.  Passing a stack buffer as
	 * cl_arg is safe here because the task is synchronous. */
	char prefix_in[3] = "IN";
	struct starpu_task *print_task = starpu_task_create();
	print_task->handles[0] = handle;
	print_task->cl = &print_cl;
	print_task->synchronous = 1;
	print_task->cl_arg = prefix_in;
	print_task->cl_arg_size = sizeof(prefix_in);
	ret = starpu_task_submit(print_task);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	/* Partition the vector in PARTS sub-vectors. */
	struct starpu_data_filter f =
	{
		.filter_func = starpu_vector_filter_block,
		.nchildren = PARTS
	};
	starpu_data_handle_t children[PARTS];

	/* PLAN selects the asynchronous partition-planning API (the path on
	 * which the coherency issue is observed) versus the eager
	 * starpu_data_partition API. */
#define PLAN 1
	if (PLAN)
	{
		starpu_data_partition_plan(handle, &f, children);
	}
	else
	{
		starpu_data_partition(handle, &f);
	}

	/* Submit a scaling task on each sub-vector, with a growing factor. */
	for (i = 0; i < PARTS; i++)
	{
		struct starpu_task *task = starpu_task_create();
		factor *= 10;
		if (PLAN)
		{
			task->handles[0] = children[i];
		}
		else
		{
			task->handles[0] = starpu_data_get_sub_data(handle, 1, i);
		}
		task->cl = &cl;
		/* Synchronous submission: &factor stays valid and is re-read
		 * with its updated value for each task. */
		task->synchronous = 1;
		task->cl_arg = &factor;
		task->cl_arg_size = sizeof(factor);
		ret = starpu_task_submit(task);
		if (ret == -ENODEV) goto enodev;
		STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");
	}

	/* Tear the partitioning down before re-reading the whole vector.
	 * NOTE(review): on the PLAN path the children results are expected to
	 * be folded back into the parent handle here — the bug report is that
	 * the OUT print below does not see them. */
	if (PLAN)
	{
		starpu_data_partition_clean(handle, PARTS, children);
	}
	else
	{
		starpu_data_unpartition(handle, STARPU_MAIN_RAM);
	}

	/* Print the vector after unpartitioning. */
	char prefix_out[4] = "OUT";
	struct starpu_task *print_task_out = starpu_task_create();
	print_task_out->handles[0] = handle;
	print_task_out->cl = &print_cl;
	print_task_out->synchronous = 1;
	print_task_out->cl_arg = prefix_out;
	print_task_out->cl_arg_size = sizeof(prefix_out);
	ret = starpu_task_submit(print_task_out);
	if (ret == -ENODEV) goto enodev;
	STARPU_CHECK_RETURN_VALUE(ret, "starpu_task_submit");

	starpu_data_unregister(handle);
	starpu_shutdown();
	return 0;

enodev:
	FPRINTF(stderr, "WARNING: No one can execute this task\n");
	starpu_shutdown();
	return 77;
}
Obtained behavior
IN Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
OUT Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
Expected behavior
IN Vector: 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
OUT Vector: 0 10 20 30 40 50 60 700 800 900 1000 1100 1200 1300 14000 15000 16000 17000 18000 19000 20000
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel18_cuda11.7_hpcx --disable-build-doc --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-opencl --with-mpicc=mpicc --enable-cuda --with-cuda-include-dir=/software/cuda_toolkits/cuda-11.7/include --with-cuda-lib-dir=/software/cuda_toolkits/cuda-11.7/lib64 --enable-fxt --enable-maxnumanodes=8 --enable-max-sched-ctxs=32 --disable-parallel-worker --disable-simgrid --disable-build-examples CC=icc CXX=icpc FC=ifort --no-create --no-recursion
Note that the behaviour was first observed without CUDA.
Distribution
Git master
Version of StarPU
Version of GPU drivers
Not related to GPU (same issue reproduced with CPU only)