Double prefetch assertion failure when using parallel workers
Steps to reproduce
Take examples/parallel_workers/parallel_workers.c and replace the call to starpu_parallel_worker_init() with the following:
char level[256] = "numa";
hwloc_obj_type_t pw_level;
if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 )
{
fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\" does not match an hwloc level.\n", level );
exit(1);
}
parallel_workers = starpu_parallel_worker_init(pw_level,
//STARPU_PARALLEL_WORKER_POLICY_NAME, "dmdas",
//STARPU_PARALLEL_WORKER_PARTITION_ONE,
//STARPU_PARALLEL_WORKER_NEW,
// STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_OPENMP,
STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_INTEL_OPENMP_MKL,
STARPU_PARALLEL_WORKER_NB, 1, //2,
//STARPU_PARALLEL_WORKER_NCORES, 1,
0);
Obtained behavior
When you run ./parallel_workers on a node with 4 A100 GPUs and 2 AMD 7763 sockets, you get:
[starpu][_starpu_prefetch_task_input_prio][assert failure] Prefetching was already requested for this task! Did you set 'prefetches' to 1 in the starpu_sched_policy structure?
lt-parallel_workers: /home_nfs/blacostex/chameleon/starpu/src/datawizard/coherency.c:1002: _starpu_prefetch_task_input_prio: Assertion `0 && "prefetch != STARPU_PREFETCH || !task->prefetched"' failed.
bt full
(gdb) #0 0x00007fffc770fa9f in raise () from /lib64/libc.so.6
No symbol table info available.
#1 0x00007fffc76e2e05 in abort () from /lib64/libc.so.6
No symbol table info available.
#2 0x00007fffc76e2cd9 in __assert_fail_base.cold.0 () from /lib64/libc.so.6
No symbol table info available.
#3 0x00007fffc77083f6 in __assert_fail () from /lib64/libc.so.6
No symbol table info available.
#4 0x00007ffff720f668 in _starpu_prefetch_task_input_prio (task=0x2,
target_node=-1495026816, worker=0, prio=-948897121, prefetch=STARPU_FETCH)
at /home_nfs/blacostex/chameleon/starpu/src/datawizard/coherency.c:1002
handle = 0xbe71a0
mode = <optimized out>
node = 0
replicate = <optimized out>
j = <optimized out>
nbuffers = <optimized out>
index = <optimized out>
__ptrs = {0x7ffff720f61a <_starpu_prefetch_task_input_prio+1610>,
0x7ffff71c7fe4 <_starpu_push_task_on_specific_worker+132>,
0x7ffff71c7c16 <_starpu_push_task_to_workers+2630>,
0x7ffff71c6a74 <_starpu_repush_task+724>,
0x7ffff71d5d37 <starpu_sched_ctx_move_task_to_ctx_locked+151>,
0x7ffff71e20ef <ws_pop_task+1903>,
0x7ffff71c951c <_starpu_pop_task+1708>,
0x7ffff720a1e8 <_starpu_get_worker_task+424>,
0x7ffff72d6969 <_starpu_cpu_driver_run_once+937>,
0x7ffff72d6449 <_starpu_cpu_worker+297>,
0x7ffff0a941cf <start_thread+239>, 0x7fffc76fadd3 <clone+67>, 0x0,
0x840, 0x1, 0x0, 0x7ffff7b16bc8 <_starpu_config+1013640>,
0x7ffff717d56d <starpu_worker_can_execute_task_first_impl+125>, 0x0,
0x7ffff7b16bc8 <_starpu_config+1013640>, 0x0, 0x0, 0x3000000002,
0x7c0000005b, 0x0, 0x0, 0x0, 0x6e00000077, 0x7fffa6e3bc80,
0x7fff80000020, 0x8, 0x4851fa1e2497d100}
__n = <optimized out>
#5 0x00007ffff71c7fe4 in _starpu_push_task_on_specific_worker (task=0x2,
workerid=-1495026816)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_policy.c:460
nbasic_workers = <optimized out>
is_basic_worker = 1
worker = 0x7ffff7a22900 <_starpu_config+13504>
combined_worker = 0x0
#6 0x00007ffff71c7c16 in _starpu_push_task_to_workers (task=0x2)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_policy.c:747
__args = 0x0
sched_ctx = <optimized out>
ret = 0
#7 0x00007ffff71c6a74 in _starpu_repush_task (j=0x2)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_policy.c:690
able = <optimized out>
task = 0xbe71a0
sched_ctx = <optimized out>
ret = <optimized out>
can_push = 1
continuation = 0
__ptrs = {0x7fff4c7d6b00, 0x0, 0x7fff0000000c, 0x3, 0x4,
0x7fff4c587c40, 0x71053f3d141,
0x7ffff7574d58 <starpu_sched_ctx_move_task_to_ctx_locked@got.plt>,
0x7fffa6e52000, 0x1, 0x5, 0x7ffff7b18f60 <_starpu_config+1022752>,
0x7ffff7b18848 <_starpu_config+1020936>,
0x7ffff7de3574 <_dl_fixup+212>, 0x5, 0x0, 0x1, 0x7ffff71351e0,
0x7fffa6e3bf80, 0x7ffff7dd0c6e <_dl_runtime_resolve_xsavec+126>,
0x1, 0x1, 0x1, 0x1, 0xbe71a0,
0x7ffff7b16ca0 <_starpu_config+1013856>,
0x7ffff7a1f440 <_starpu_config>, 0xc1b400, 0x1,
0x7ffff7b16c18 <_starpu_config+1013720>,
0x7ffff7b18848 <_starpu_config+1020936>,
0x7ffff71516b7 <_starpu_barrier_counter_increment+87>}
__n = <optimized out>
len = <optimized out>
nbargs_str = <optimized out>
nbargs = <optimized out>
total_len = <optimized out>
futargs = 0x7ffff7b16bc8 <_starpu_config+1013640>
__args = 0x7ffff7b16bc8 <_starpu_config+1013640>
__args = 0x0
#8 0x00007ffff71d5d37 in starpu_sched_ctx_move_task_to_ctx_locked (task=0x2,
sched_ctx=2799940480, with_repush=0)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_ctx.c:2321
j = 0xc1b400
__ptrs = {0x7fff4c5842b0, 0x0, 0x7fff4c5842e0, 0x7fff4c584310,
0x7fff4c584310, 0x0, 0x7fff4c584340, 0x7fff4c584370, 0x7fff4c584370,
0x0, 0x7fff4c5843a0, 0x7fff4c5843d0, 0xc20f20,
0x7ffff7a29c80 <_starpu_config+43072>, 0x0,
0x7ffff72095f7 <_starpu_driver_end_job+343>,
0x7ffff7b16e30 <_starpu_config+1014256>, 0x7, 0x0,
0x7fffc7d42341 <__kmp_release_ticket_lock+97>, 0x0, 0x7fff0000000f,
0x95ea50, 0xbe71a0, 0x2, 0x8000000000000006, 0x0, 0x0, 0x0, 0x0,
0x0, 0x0}
__n = <optimized out>
#9 0x00007ffff71e20ef in _starpu_sched_ctx_worker_is_master_for_child_ctx (
sched_ctx_id=<optimized out>, workerid=<optimized out>,
task=<optimized out>)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_ctx.h:312
child_sched_ctx = 0
#10 ws_pop_task (sched_ctx_id=2)
at /home_nfs/blacostex/chameleon/starpu/src/sched_policies/work_stealing_policy.c:593
ws = 0x7fffa6e52000
task = 0xbe71a0
workerid = <optimized out>
victim = -139360184
#11 0x00007ffff71c951c in _starpu_pop_task (worker=0x2)
at /home_nfs/blacostex/chameleon/starpu/src/core/sched_policy.c:1060
sched_ctx = 0x7ffff7b18848 <_starpu_config+1020936>
task = <optimized out>
worker_id = <optimized out>
node = <optimized out>
profiling = <optimized out>
pop_start_time = {tv_sec = 0, tv_nsec = 0}
i = 4155607112
nbuffers = 4154586176
#12 0x00007ffff720a1e8 in _starpu_get_worker_task (worker=0x2,
workerid=-1495026816, memnode=0)
at /home_nfs/blacostex/chameleon/starpu/src/drivers/driver_common/driver_common.c:431
__ptrs = {0x0, 0x7ffff7216f53 <_starpu_handle_node_idle_requests+67>,
0x7fffa6e3c53c, 0x3, 0x0, 0x0, 0x48470121,
0x7ffff7b16498 <_starpu_config+1011800>, 0x0, 0x0, 0x0, 0x0, 0x1001,
0x0 <repeats 11 times>, 0x1001, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
__n = <optimized out>
p_ret = <optimized out>
task = 0x7ffff7a075a4 <_starpu_profiling>
keep_awake = <optimized out>
p_ret = <optimized out>
__ptrs = {0x0, 0x7ffff7216f53 <_starpu_handle_node_idle_requests+67>,
0x7fffa6e3c53c, 0x3, 0x0, 0x0, 0x48470121,
0x7ffff7b16498 <_starpu_config+1011800>, 0x0, 0x0, 0x0, 0x0, 0x1001,
0x0 <repeats 11 times>, 0x1001, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0}
__n = <optimized out>
#13 0x00007ffff72d6969 in _starpu_cpu_driver_run_once (cpu_worker=0x2)
at /home_nfs/blacostex/chameleon/starpu/src/drivers/cpu/driver_cpu.c:606
memnode = 0
workerid = 1
pi = {conf = 0x0, event_type = starpu_prof_tool_event_start_transfer,
starpu_version = {1, 4, 99}, thread_id = -1494933504, worker_id = 1,
device_number = 1, driver_type = starpu_prof_tool_driver_cpu,
memnode = 0, bytes_to_transfer = 0, bytes_transfered = 0,
fun_ptr = 0x0}
res = <optimized out>
j = <optimized out>
task = 0x0
pending_task = 0x0
rank = 0
continuation_wake_up = <optimized out>
__ptrs = {0x0, 0x7ffff7b16e30 <_starpu_config+1014256>, 0x0,
0x7ffff7fe08d0 <fut_active>, 0x0 <repeats 28 times>}
__n = <optimized out>
#14 0x00007ffff72d6449 in _starpu_cpu_worker (arg=0x2)
at /home_nfs/blacostex/chameleon/starpu/src/drivers/cpu/driver_cpu.c:712
__args = 0x0
worker = <optimized out>
pi = {conf = 0x0, event_type = starpu_prof_tool_event_start_transfer,
starpu_version = {1, 4, 99}, thread_id = -1494933504, worker_id = 1,
device_number = 1, driver_type = starpu_prof_tool_driver_cpu,
memnode = 0, bytes_to_transfer = 0, bytes_transfered = 0,
fun_ptr = 0x0}
__args = 0x0
#15 0x00007ffff0a941cf in start_thread () from /lib64/libpthread.so.0
No symbol table info available.
#16 0x00007fffc76fadd3 in clone () from /lib64/libc.so.6
No symbol table info available.
Expected behavior
The example runs normally, without triggering the assertion.
Configuration
/home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-intel18_cuda11.7_hpcx --disable-build-doc --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-opencl --enable-icc --with-mpicc=mpicc --enable-cuda --with-cuda-include-dir=/software/cuda_toolkits/cuda-11.7/include --with-cuda-lib-dir=/software/cuda_toolkits/cuda-11.7/lib64 --enable-fxt --enable-maxnumanodes=9 --enable-max-sched-ctxs=32 --disable-simgrid --disable-build-examples CC=icc CXX=icpc FC=ifort --no-create --no-recursion
Configuration result
Distribution
RHEL 8.6
Version of StarPU
Latest Master
Version of GPU drivers
CUDA Version 12.0