Deadlock using starpu_mpi_data_cpy
Steps to reproduce
Build Chameleon (branch https://gitlab.inria.fr/satanas-atos/chameleon/-/tree/eviden/workspace_mpi_getrf_XL?ref_type=heads) against StarPU, with synchronous mode enabled (CHAMELEON_RUNTIME_SYNC=ON).
Run pzgetrf_nopiv on a 2x2 block matrix, for example:
export STARPU_SCHED=prio
export NBLOCK=2
export BLOCKSIZE=576
export STARPU_CHAMELEON_LOOKAHEAD=0
export CHAMELEON_STATS_BYSTEP=1
export STARPU_HOME=$PWD/test
export STARPU_CALIBRATE=2
export STARPU_FXT_PREFIX=$PWD/fxt
export STARPU_HISTORY_MAX_ERROR=10000000000
export STARPU_TRACE_BUFFER_SIZE=3000
export CHAMELEON_GENERIC=0
export nmpi=4
export nthreads=8 #$((128/$nmpi))
#srun -l -n $nmpi -c $nthreads --threads-per-core 1 ~/chameleon/hpl-scripts/chameleon-starpu/env/spartan/wrappers/wrapper_amd7763.sh valgrind --track-origins=yes ~/chameleon/chameleon-install-${TC}/bin/chameleon_dtesting -o dgetrf_nopiv -b $BLOCKSIZE -n $(($NBLOCK*$BLOCKSIZE)) -l 1 -H -P 1 --trace -c
srun -l -n $nmpi -c $nthreads --threads-per-core 1 ~/chameleon/hpl-scripts/chameleon-starpu/env/spartan/wrappers/wrapper_amd7763.sh ~/chameleon/chameleon-install-${TC}/bin/chameleon_dtesting -o dgetrf_nopiv -b $BLOCKSIZE -n $(($NBLOCK*$BLOCKSIZE)) -l 1 -H -P 2 --trace -c
Sadly, my reproducer (appended below) does not trigger the issue.
Obtained behavior
I get a deadlock when rank 3 posts a starpu_mpi_data_cpy to receive its data from rank 1. The data has already been received, but rank 3 cannot retrieve it.
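For reference, the call pattern at the hang boils down to the lines below (handles and tags taken from the reproducer further down; this is a sketch of the pattern, not the actual Chameleon code, and it assumes the 0 argument is the asynchronous flag, as in starpu_data_cpy):

/* both rank 1 (owner of the source) and rank 3 (owner of the destination) call: */
starpu_mpi_data_cpy(data[3], data[1], MPI_COMM_WORLD, 0 /* blocking */, NULL, NULL);
/* rank 3 never returns from this call, even though the payload from rank 1 has already arrived */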
Expected behavior
The run completes without deadlocking.
Configuration
$ /home_nfs/blacostex/chameleon/starpu/configure --prefix=/home_nfs/blacostex/chameleon/starpu-install-oneapi2023.1_openmpi4.1.5.1 --enable-fast --enable-mpi --enable-maxcpus=128 --enable-max-sched-ctxs=32 --disable-build-doc --disable-build-examples --disable-build-tests --disable-starpufft --disable-mlr --disable-hdf5 --disable-fortran --disable-cuda --enable-maxcudadev=0 --disable-opencl --enable-maxopencldev=0 --enable-maxmaxfpgadev=0 --disable-simgrid --disable-parallel-worker --disable-simgrid --disable-build-examples --enable-fxt
Distribution
RHEL 8.8
Version of StarPU
master (382f97a6)
Version of GPU drivers
No GPUs
Non-reproducing reproducer:
/* StarPU --- Runtime system for heterogeneous multicore architectures.
*
* Copyright (C) 2015-2024 Université de Bordeaux, CNRS (LaBRI UMR 5800), Inria
*
* StarPU is free software; you can redistribute it and/or modify
* it under the terms of the GNU Lesser General Public License as published by
* the Free Software Foundation; either version 2.1 of the License, or (at
* your option) any later version.
*
* StarPU is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*
* See the GNU Lesser General Public License in COPYING.LGPL for more details.
*/
#include <unistd.h> /* for sleep() */
#include <stdlib.h> /* for malloc()/free() */
#include <starpu_mpi.h>
#include "helper.h"
#define DATA_TAG 666
#define BUFFER_SIZE 663552
#define INC_COUNT 10
#define LET_STARPU_ALLOCATE 1
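/* BUFFER_SIZE is a count of ints; 663552 ints = 576*576 doubles, i.e.
 * presumably one BLOCKSIZE x BLOCKSIZE tile of the chameleon run above.
 * LET_STARPU_ALLOCATE controls whether StarPU allocates each piece itself
 * (see the registration loop in main()). */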
void init_cpu(void *descr[], void *_args)
{
int i, rank;
int *value = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
starpu_codelet_unpack_args(_args, &rank);
for (i=0; i < BUFFER_SIZE; i++)
value[i] = rank*BUFFER_SIZE + i;
}
struct starpu_codelet initcodelet =
{
.cpu_funcs = {init_cpu},
.nbuffers = 1,
.modes = {STARPU_W},
.model = &starpu_perfmodel_nop,
.name = "init"
};
void print_cpu(void *descr[], void *_args)
{
int rank;
int *value = (int *)STARPU_VECTOR_GET_PTR(descr[0]);
starpu_codelet_unpack_args(_args, &rank);
(void)value; /* stub: nothing is actually printed */
}
struct starpu_codelet printcodelet =
{
.cpu_funcs = {print_cpu},
.nbuffers = 1,
.modes = {STARPU_W},
.model = &starpu_perfmodel_nop,
.name = "print"
};
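/* The getrf/trsm/gemm kernels below only emulate work (a message plus a
 * 1s sleep); the reproducer is about the communication pattern, not the math. */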
void getrf_cpu(void *descr[], void *_args)
{
int rank;
starpu_codelet_unpack_args(_args, &rank);
fprintf(stderr, "[%d] GETRF \n", rank);
sleep(1);
}
struct starpu_codelet getrfcodelet =
{
.cpu_funcs = {getrf_cpu},
.nbuffers = 1,
.modes = {STARPU_RW},
.model = &starpu_perfmodel_nop,
.name = "getrf"
};
void trsm_cpu(void *descr[], void *_args)
{
int rank;
starpu_codelet_unpack_args(_args, &rank);
fprintf(stderr, "[%d] TRSM \n", rank);
sleep(1);
}
struct starpu_codelet trsmcodelet =
{
.cpu_funcs = {trsm_cpu},
.nbuffers = 2,
.modes = {STARPU_R, STARPU_W},
.model = &starpu_perfmodel_nop,
.name = "trsm"
};
void gemm_cpu(void *descr[], void *_args)
{
int rank;
starpu_codelet_unpack_args(_args, &rank);
fprintf(stderr, "[%d] GEMM \n", rank);
sleep(1);
}
struct starpu_codelet gemmcodelet =
{
.cpu_funcs = {gemm_cpu},
.nbuffers = 3,
.modes = {STARPU_R, STARPU_R, STARPU_W},
.model = &starpu_perfmodel_nop,
.name = "gemm"
};
int main(int argc, char **argv)
{
int size, rank;
int ret;
int *value = malloc(BUFFER_SIZE*sizeof(int));
int *lvalue = malloc(BUFFER_SIZE*sizeof(int));
starpu_data_handle_t *data;
int mpi_init;
int i;
MPI_INIT_THREAD(&argc, &argv, MPI_THREAD_SERIALIZED, &mpi_init);
ret = starpu_mpi_init_conf(&argc, &argv, mpi_init, MPI_COMM_WORLD, NULL);
STARPU_CHECK_RETURN_VALUE(ret, "starpu_mpi_init_conf");
starpu_mpi_comm_rank(MPI_COMM_WORLD, &rank);
starpu_mpi_comm_size(MPI_COMM_WORLD, &size);
STARPU_ASSERT_MSG(size == 4, "This test requires 4 ranks\n");
data = (starpu_data_handle_t*)malloc(size*sizeof(starpu_data_handle_t));
starpu_data_handle_t local_data;
starpu_vector_data_register(&local_data, STARPU_MAIN_RAM, (uintptr_t)lvalue, BUFFER_SIZE, sizeof(int));
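/* data[i] is owned by rank i and tagged DATA_TAG + i; local_data is a
 * purely local scratch handle used as output of the fake kernels. */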
for(i=0; i<size; i++)
{
if (i == rank && !LET_STARPU_ALLOCATE)
starpu_vector_data_register(&data[i], STARPU_MAIN_RAM, (uintptr_t)value, BUFFER_SIZE, sizeof(int));
else
starpu_vector_data_register(&data[i], -1, (uintptr_t)NULL, BUFFER_SIZE, sizeof(int));
starpu_mpi_data_register_comm(data[i], DATA_TAG + i, i, MPI_COMM_WORLD);
}
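// INIT: each rank fills its own tile and its local buffer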
starpu_mpi_task_insert(MPI_COMM_WORLD, &initcodelet, STARPU_W, data[rank], STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
starpu_task_insert( &initcodelet, STARPU_W, local_data, STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
// GETRF BCAST 0 -> [1,2]
starpu_mpi_task_insert(MPI_COMM_WORLD, &getrfcodelet, STARPU_RW, data[0], STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
if (rank == 0 || rank == 1) {
fprintf(stderr, "[%d] 0-> 1\n", rank);
starpu_mpi_data_cpy(data[1],data[0], MPI_COMM_WORLD, 0, NULL, NULL);
fprintf(stderr, "[%d] 0-> 1 done\n", rank);
}
if (rank == 0 || rank == 2) {
fprintf(stderr, "[%d] 0-> 2\n", rank);
starpu_mpi_data_cpy(data[2],data[0], MPI_COMM_WORLD, 0, NULL, NULL);
fprintf(stderr, "[%d] 0-> 2 done\n", rank);
}
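// TRSM on rank 2 (reads data[2], updates local_data)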
if (rank==2)
starpu_task_insert( &trsmcodelet, STARPU_R, data[2], STARPU_RW, local_data, STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
// TRSM BCAST 2 -> 3
if (rank == 3 || rank == 2) {
fprintf(stderr, "[%d] 2-> 3\n", rank);
starpu_mpi_data_cpy(data[3],data[2], MPI_COMM_WORLD, 0, NULL, NULL);
fprintf(stderr, "[%d] 2-> 3 done\n", rank);
}
// TRSM BCAST 1 -> 3
if (rank==1)
starpu_task_insert( &trsmcodelet, STARPU_R, data[1], STARPU_RW, local_data, STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
if (rank == 3 || rank == 1) {
fprintf(stderr, "[%d] 1-> 3\n", rank);
starpu_mpi_data_cpy(data[3],data[1], MPI_COMM_WORLD, 0, NULL, NULL);
fprintf(stderr, "[%d] 1-> 3 done\n", rank);
}
if (rank==3)
starpu_task_insert(&gemmcodelet, STARPU_R, data[3], STARPU_R, data[3], STARPU_RW, local_data, STARPU_VALUE, &rank, sizeof(rank), STARPU_TASK_SYNCHRONOUS, 1, 0);
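// 3 -> [1,2]: unlike the earlier copies, these calls are not guarded by a rank test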
starpu_mpi_data_cpy(data[1],data[3], MPI_COMM_WORLD, 0, NULL, NULL);
starpu_mpi_data_cpy(data[2],data[3], MPI_COMM_WORLD, 0, NULL, NULL);
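/* gather each rank's own tile into its local buffer for the final check */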
starpu_data_cpy(local_data, data[rank], 0, NULL, NULL);
starpu_task_wait_for_all();
for(i=0; i<size; i++)
{
starpu_data_unregister(data[i]);
}
starpu_data_unregister(local_data);
FPRINTF_MPI(stderr, "value after calculation: %d (expected %d)\n", lvalue[0], 0);
STARPU_ASSERT_MSG(lvalue[0] == 0, "[rank %d] value %d is not the expected value %d\n", rank, lvalue[0], 0);
free(value);
free(lvalue);
free(data);
starpu_mpi_shutdown();
if (!mpi_init)
MPI_Finalize();
return 0;
}