diff --git a/ChangeLog b/ChangeLog index 1328ff7e129c969c0b5d994560fa539135a726ec..23cd6845ae17427484f1122274166e723f43554b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ chameleon-1.4.0 ------------------------------------------------------------------------ + - StarPU: When using starpu > 1.4.8, use the new distributed submit interface in the codelets instead of the classical insert task interface. + - ci: use -Werror to prevent from adding warning to the code + - Fix: all warning reported by the switch to -Werror chameleon-1.3.0 ------------------------------------------------------------------------ diff --git a/control/async.c b/control/async.c index d933e381789144622826025f79a1737a3a57d3dc..d877bca7d9d5a0f883eea642dd39565690fad15c 100644 --- a/control/async.c +++ b/control/async.c @@ -48,6 +48,9 @@ int chameleon_sequence_create(CHAM_context_t *chamctxt, RUNTIME_sequence_t **seq return CHAMELEON_ERR_OUT_OF_RESOURCES; } + (*sequence)->comm = chamctxt->comm; + (*sequence)->myrank = RUNTIME_comm_rank( chamctxt ); + RUNTIME_sequence_create( chamctxt, *sequence ); (*sequence)->status = CHAMELEON_SUCCESS; diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h index cbedc48a172b5993e08003c5e9ba02cac74addfe..d2122d0838f375986641b8cb922f431a48eabbdf 100644 --- a/include/chameleon/runtime_struct.h +++ b/include/chameleon/runtime_struct.h @@ -84,9 +84,10 @@ typedef struct runtime_request_s { */ typedef struct runtime_sequence_s { int status; /**< Return status registered by the tasks for the request */ + int myrank; /**< MPI Comm rank within the associated communicator */ RUNTIME_request_t *request; /**< Pointer to the request that failed if any, NULL otherwise */ void *schedopt; /**< Specific runtime data pointer to handle the sequence */ - MPI_Comm comm; /**< MPI communicator */ + MPI_Comm comm; /**< MPI communicator */ } RUNTIME_sequence_t; /** diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 801a1f89a5c037f9cdd6ea48400d8d81a7538a01..b7313a36f2a9c5ac217ca3d45f6d5b4fa78f7145 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -180,6 +180,19 @@ int main() { if ( HAVE_STARPU_MPI_DATA_CPY_PRIORITY ) message("-- ${Blue}Add definition HAVE_STARPU_MPI_DATA_CPY_PRIORITY${ColourReset}") endif() + + check_function_exists(starpu_mpi_exchange_data_before_execution HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION) + if ( HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION ) + message("-- ${Blue}Add definition HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION${ColourReset}") + endif() + + endif() + + if ( CHAMELEON_USE_MPI AND NOT HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION ) + set( CHAMELEON_STARPU_USE_INSERT ON CACHE BOOL "Enable the task insert interface instead of the task submit" FORCE) + else() + option( CHAMELEON_STARPU_USE_INSERT + "Enable the task insert interface instead of the task submit" OFF ) endif() if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION) diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index b276d515708d9395652e7f1ecac567a0d56bceec..dfdb27fc4666b2fa84c63eb7621224b9a5cad256 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -25,23 +25,27 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zgeadd_args_s { + cham_trans_t trans; + int m; + int n; + CHAMELEON_Complex64_t alpha; + CHAMELEON_Complex64_t beta; +}; + #if !defined(CHAMELEON_SIMULATION) static void cl_zgeadd_cpu_func( void *descr[], void *cl_arg ) { - cham_trans_t trans; - int M; - int N; - CHAMELEON_Complex64_t alpha; - CHAM_tile_t *tileA; - CHAMELEON_Complex64_t beta; - CHAM_tile_t *tileB; + struct cl_zgeadd_args_s *clargs = (struct cl_zgeadd_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); - TCORE_zgeadd( trans, M, N, alpha, tileA, beta, tileB ); + TCORE_zgeadd( clargs->trans, clargs->m, clargs->n, + clargs->alpha, tileA, clargs->beta, tileB ); return; } @@ -50,22 +54,17 @@ cl_zgeadd_cpu_func( void *descr[], void *cl_arg ) static void cl_zgeadd_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_trans_t trans; - int M; - int N; - cuDoubleComplex alpha; - CHAM_tile_t *tileA; - cuDoubleComplex beta; - CHAM_tile_t *tileB; + struct cl_zgeadd_args_s *clargs = (struct cl_zgeadd_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get( descr[0] ); tileB = cti_interface_get( descr[1] ); - starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); - CUDA_zgeadd( trans, M, N, - &alpha, tileA->mat, tileA->ld, - &beta, tileB->mat, tileB->ld, + CUDA_zgeadd( clargs->trans, clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, + (cuDoubleComplex*)&(clargs->beta), tileB->mat, tileB->ld, handle ); return; @@ -78,6 +77,7 @@ cl_zgeadd_cuda_func( void *descr[], void *cl_arg ) */ CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC ) +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, @@ -89,28 +89,124 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, return; } - struct starpu_codelet *codelet = &cl_zgeadd; - void (*callback)(void*) = options->profiling ? cl_zgeadd_callback : NULL; - int accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + void (*callback)(void*); + struct cl_zgeadd_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zgeadd"; + int accessB; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgeadd_args_s ) ); + clargs->trans = trans; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback fro profiling information */ + callback = options->profiling ? cl_zgeadd_callback : NULL; + + /* Reduce the B access if needed */ + accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessB, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zgeadd, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgeadd_args_s), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + accessB, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_NAME, cl_name, 0 ); (void)nb; } + +#else + +void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, + cham_trans_t trans, int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, B, Bm, Bn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zgeadd, 2 ); + int accessB; + + /* Reduce the B access if needed */ + accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, accessB ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgeadd_args_s ) ); + clargs->trans = trans; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgeadd_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgeadd_callback ); + + /* Flops */ + //task->flops = flops_zgeadd( m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zgeadd", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index d4895c5daaa5999eba42e89e2cd2b6db0fbb59a9..1598f7ebcfbf83ed73cce710b7fe6150ffac1c85 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -126,6 +126,7 @@ CODELETS_GPU( zgemm, cl_zgemm_cpu_func, cl_zgemm_hip_func, STARPU_HIP_ASYNC ) CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, @@ -139,12 +140,12 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, return; } - struct cl_zgemm_args_s *clargs = NULL; void (*callback)(void*); - int accessC; + struct cl_zgemm_args_s *clargs = NULL; int exec = 0; const char *cl_name = "zgemm_Astat"; uint32_t where = cl_zgemm.where; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -229,12 +230,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, return; } - struct cl_zgemm_args_s *clargs = NULL; void (*callback)(void*); - int accessC; - int exec = 0; + struct cl_zgemm_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "zgemm"; uint32_t where = cl_zgemm.where; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -295,3 +296,133 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, STARPU_EXECUTE_WHERE, where, 0 ); } + +#else + +void __INSERT_TASK_zgemm( const RUNTIME_option_t *options, + int xrank, int accessC, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == (CHAMELEON_Complex64_t)0. ) { + INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zgemm, 3 ); + + /* + * Register the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, xrank ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */ +#if defined(PRECISION_z) || defined(PRECISION_c) + if ( k == 1 ) { + task->where = STARPU_CPU; + } +#endif + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zgemm_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgemm_callback ); + + /* Flops */ + task->flops = flops_zgemm( m, n, k ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW; + +#if defined(HAVE_STARPU_MPI_REDUX) + if ( beta == (CHAMELEON_Complex64_t)1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + + __INSERT_TASK_zgemm( options, + A->get_rankof( A, Am, An ), accessC, + transA, transB, m, n, k, nb, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} + +void INSERT_TASK_zgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : + (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0)); + + __INSERT_TASK_zgemm( options, + C->get_rankof( C, Cm, Cn ), accessC, + transA, transB, m, n, k, nb, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} +#endif diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 5210970da0610040b9285b964764c24faa95bc37..ce592b7c818785a3615ef8c0a95a38e2856f7ec9 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -28,10 +28,10 @@ #include "runtime_codelet_z.h" struct cl_zhemm_args_s { - cham_side_t side; - cham_uplo_t uplo; - int m; - int n; + cham_side_t side; + cham_uplo_t uplo; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; }; @@ -125,6 +125,7 @@ CODELETS_GPU( zhemm, cl_zhemm_cpu_func, cl_zhemm_hip_func, STARPU_HIP_ASYNC ) CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, @@ -138,11 +139,11 @@ void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, return; } - struct cl_zhemm_args_s *clargs = NULL; void (*callback)(void*); - int accessC; + struct cl_zhemm_args_s *clargs = NULL; int exec = 0; const char *cl_name = "zhemm_Astat"; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -272,3 +273,122 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, STARPU_NAME, cl_name, 0 ); } + +#else + +void __INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, int xrank, int accessC, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == (CHAMELEON_Complex64_t)0. ) { + INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zhemm, 3 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, xrank ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zhemm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zhemm_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zhemm_callback ); + + /* Flops */ + task->flops = flops_zhemm( side, m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW; + +#if defined(HAVE_STARPU_MPI_REDUX) + if ( beta == (CHAMELEON_Complex64_t)1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + + __INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + A->get_rankof( A, Am, An ), accessC, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} + +void INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : + (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0)); + + __INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + C->get_rankof( C, Cm, Cn ), accessC, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} +#endif diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 587f8cb87e2f739e6a378a97d9c59f60c6fa1710..7176bbe393513f4d9db74f48b19c7b7d63dabbef 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -27,53 +27,50 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zher2k_args_s { + cham_uplo_t uplo; + cham_trans_t trans; + int n; + int k; + CHAMELEON_Complex64_t alpha; + double beta; +}; + #if !defined(CHAMELEON_SIMULATION) static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) { - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - CHAMELEON_Complex64_t alpha; + struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - double beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - TCORE_zher2k(uplo, trans, - n, k, alpha, tileA, tileB, beta, tileC); + TCORE_zher2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, clargs->alpha, + tileA, tileB, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - cuDoubleComplex alpha; + struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - double beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - - CUDA_zher2k( uplo, trans, - n, k, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, + CUDA_zher2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &(clargs->beta), tileC->mat, tileC->ld, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ @@ -82,28 +79,21 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) static void cl_zher2k_hip_func(void *descr[], void *cl_arg) { hipblasHandle_t handle = starpu_hipblas_get_local_handle(); - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - hipblasDoubleComplex alpha; + struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - double beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - - HIP_zher2k( uplo, trans, - n, k, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, - handle ); + HIP_zher2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, + (hipblasDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &(clargs->beta), tileC->mat, tileC->ld, + handle ); } #endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -117,18 +107,13 @@ CODELETS_GPU( zher2k, cl_zher2k_cpu_func, cl_zher2k_hip_func, STARPU_HIP_ASYNC ) CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC ) #endif -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void -INSERT_TASK_zher2k( const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - double beta, const CHAM_desc_t *C, int Cm, int Cn ) +#if defined(CHAMELEON_STARPU_USE_INSERT) +void INSERT_TASK_zher2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { INSERT_TASK_zlascal( options, uplo, n, n, nb, @@ -136,30 +121,139 @@ INSERT_TASK_zher2k( const RUNTIME_option_t *options, return; } - (void)nb; - struct starpu_codelet *codelet = &cl_zher2k; - void (*callback)(void*) = options->profiling ? cl_zher2k_callback : NULL; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + void (*callback)(void*); + struct cl_zher2k_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zher2k"; + int accessC; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zher2k_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback fro profiling information */ + callback = options->profiling ? cl_zher2k_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), - STARPU_VALUE, &beta, sizeof(double), - accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zher2k, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zher2k_args_s), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, + STARPU_NAME, cl_name, 0 ); + + (void)nb; +} + +#else + +void INSERT_TASK_zher2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zher2k, 3 ); + int accessC; + + /* Reduce the C access if needed */ + accessC = ( beta == (double)0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zher2k_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zher2k_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zher2k_callback ); + + /* Flops */ + task->flops = flops_zher2k( k, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zher2k", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; } + +#endif diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index bcd9eb1ff2195d223dfbb10770ceb9e3e2b4872f..2a3eda14079e069ec08f7c090ce6bf9cab00cebb 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -29,12 +29,12 @@ #include "runtime_codelet_z.h" struct cl_zherk_args_s { - cham_uplo_t uplo; + cham_uplo_t uplo; cham_trans_t trans; - int n; - int k; - double alpha; - double beta; + int n; + int k; + double alpha; + double beta; }; #if !defined(CHAMELEON_SIMULATION) @@ -109,6 +109,7 @@ CODELETS_GPU( zherk, cl_zherk_cpu_func, cl_zherk_hip_func, STARPU_HIP_ASYNC ) CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -121,11 +122,11 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, return; } - struct cl_zherk_args_s *clargs = NULL; void (*callback)(void*); - int accessC; - int exec = 0; - const char *cl_name = "zherk"; + struct cl_zherk_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zherk"; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -173,3 +174,83 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_zherk( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zherk, 2 ); + int accessC; + + /* Reduce the C access if needed */ + accessC = ( beta == (double)0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zherk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zherk_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zherk_callback ); + + /* Flops */ + task->flops = flops_zherk( k, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zherk", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index 35fb10c358ec7e62610948cb79c306845641a373..c984f574d114672a038999219a54c7eb1cb32008 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -25,9 +25,9 @@ #include "runtime_codelet_z.h" struct cl_zlascal_args_s { - cham_uplo_t uplo; - int m; - int n; + cham_uplo_t uplo; + int m; + int n; CHAMELEON_Complex64_t alpha; }; @@ -49,6 +49,7 @@ cl_zlascal_cpu_func( void *descr[], void *cl_arg ) */ CODELETS_CPU( zlascal, cl_zlascal_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zlascal( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, @@ -64,10 +65,10 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, return; } - struct cl_zlascal_args_s *clargs = NULL; void (*callback)(void*); - int exec = 0; - const char *cl_name = "zlascal"; + struct cl_zlascal_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zlascal"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -102,3 +103,78 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_zlascal( const RUNTIME_option_t *options, + cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, + const CHAM_desc_t *A, int Am, int An ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlaset( options, uplo, m, n, + alpha, alpha, A, Am, An ); + return; + } + else if ( alpha == 1. ) { + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zlascal, 1 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, A->get_rankof( A, Am, An ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_RW ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zlascal_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zlascal_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zlascal_callback ); + + /* Flops */ + //task->flops = flops_zlascal( uplo, m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, + A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zlascal", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index ee38d7b92d7a67132ddd6a7eea4065a1a317c816..46cd4d7a616ec6113e05d19e60fb360e7c4f1752 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -29,11 +29,11 @@ #include "runtime_codelet_z.h" struct cl_zpotrf_args_s { - cham_uplo_t uplo; - int n; - int iinfo; + cham_uplo_t uplo; + int n; + int iinfo; RUNTIME_sequence_t *sequence; - RUNTIME_request_t *request; + RUNTIME_request_t *request; }; #if !defined(CHAMELEON_SIMULATION) @@ -65,14 +65,15 @@ CODELETS( zpotrf, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, STARPU_CUDA_ASYNC ) CODELETS_CPU( zpotrf, cl_zpotrf_cpu_func ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - struct cl_zpotrf_args_s *clargs = NULL; void (*callback)(void*); - int exec = 0; + struct cl_zpotrf_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "zpotrf"; /* Handle cache */ @@ -114,3 +115,69 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, + cham_uplo_t uplo, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + int iinfo ) +{ + INSERT_TASK_COMMON_PARAMETERS( zpotrf, 1 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, A->get_rankof( A, Am, An ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_RW ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zpotrf_args_s ) ); + clargs->uplo = uplo; + clargs->n = n; + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zpotrf_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zpotrf_callback ); + + /* Flops */ + task->flops = flops_zpotrf( n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 1, + A->get_blktile( A, Am, An ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 5882380fc36964275359a39695c29b8f54c4b344..c3ab90117f31d681fb10c20ec42ad41d63981fe9 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -29,10 +29,10 @@ #include "runtime_codelet_z.h" struct cl_zsymm_args_s { - cham_side_t side; - cham_uplo_t uplo; - int m; - int n; + cham_side_t side; + cham_uplo_t uplo; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; }; @@ -126,6 +126,7 @@ CODELETS_GPU( zsymm, cl_zsymm_cpu_func, cl_zsymm_hip_func, STARPU_HIP_ASYNC ) CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, int m, int n, int nb, @@ -139,11 +140,11 @@ void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, return; } - struct cl_zsymm_args_s *clargs = NULL; void (*callback)(void*); - int accessC; + struct cl_zsymm_args_s *clargs = NULL; int exec = 0; const char *cl_name = "zsymm_Astat"; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -219,11 +220,11 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, return; } - struct cl_zsymm_args_s *clargs = NULL; void (*callback)(void*); - int accessC; - int exec = 0; + struct cl_zsymm_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "zsymm"; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -273,3 +274,122 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, STARPU_NAME, cl_name, 0 ); } + +#else + +void __INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, int xrank, int accessC, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == (CHAMELEON_Complex64_t)0. ) { + INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zsymm, 3 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, xrank ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zsymm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zsymm_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsymm_callback ); + + /* Flops */ + task->flops = flops_zsymm( side, m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW; + +#if defined(HAVE_STARPU_MPI_REDUX) + if ( beta == (CHAMELEON_Complex64_t)1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + + __INSERT_TASK_zsymm( options, side, uplo, m, n, nb, + A->get_rankof( A, Am, An ), accessC, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} + +void INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* Reduce the C access if needed */ + int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : + (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0)); + + __INSERT_TASK_zsymm( options, side, uplo, m, n, nb, + C->get_rankof( C, Cm, Cn ), accessC, + alpha, A, Am, An, + B, Bm, Bn, + beta, C, Cm, Cn ); +} +#endif diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 179c2024e6bb13228e87830c46766bbae96002c5..f8675eceec9a1032a3bd5b61b717eb39e45d64db 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -27,53 +27,51 @@ #include "chameleon_starpu_internal.h" #include "runtime_codelet_z.h" +struct cl_zsyr2k_args_s { + cham_uplo_t uplo; + cham_trans_t trans; + int n; + int k; + CHAMELEON_Complex64_t alpha; + CHAMELEON_Complex64_t beta; +}; + + #if !defined(CHAMELEON_SIMULATION) static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) { - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - CHAMELEON_Complex64_t alpha; + struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - TCORE_zsyr2k(uplo, trans, - n, k, alpha, tileA, tileB, beta, tileC); + TCORE_zsyr2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, clargs->alpha, + tileA, tileB, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - cuDoubleComplex alpha; + struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - - CUDA_zsyr2k( uplo, trans, - n, k, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, + CUDA_zsyr2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ @@ -82,28 +80,21 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) static void cl_zsyr2k_hip_func(void *descr[], void *cl_arg) { hipblasHandle_t handle = starpu_hipblas_get_local_handle(); - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; - hipblasDoubleComplex alpha; + struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; - hipblasDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - - HIP_zsyr2k( uplo, trans, - n, k, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, - handle ); + HIP_zsyr2k( clargs->uplo, clargs->trans, + clargs->n, clargs->k, + (hipblasDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (hipblasDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, + handle ); } #endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -117,17 +108,13 @@ CODELETS_GPU( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_hip_func, STARPU_HIP_ASYNC ) CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC ) #endif -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +#if defined(CHAMELEON_STARPU_USE_INSERT) +void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { INSERT_TASK_zlascal( options, uplo, n, n, nb, @@ -135,30 +122,139 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, return; } - (void)nb; - struct starpu_codelet *codelet = &cl_zsyr2k; - void (*callback)(void*) = options->profiling ? cl_zsyr2k_callback : NULL; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + void (*callback)(void*); + struct cl_zsyr2k_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zsyr2k"; + int accessC; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback fro profiling information */ + callback = options->profiling ? cl_zsyr2k_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + rt_starpu_insert_task( - codelet, - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &trans, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &k, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zsyr2k, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyr2k_args_s), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, + STARPU_NAME, cl_name, 0 ); + + (void)nb; } + +#else + +void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zsyr2k, 3 ); + int accessC; + + /* Reduce the C access if needed */ + accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zsyr2k_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsyr2k_callback ); + + /* Flops */ + task->flops = flops_zsyr2k( k, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zsyr2k", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index e92f2b85f496c9f551f082b6c64a9fd966d892e7..32ded083dbad46d6a9a2ce2b6504e3766a7aac9b 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -30,10 +30,10 @@ #include "runtime_codelet_z.h" struct cl_zsyrk_args_s { - cham_uplo_t uplo; - cham_trans_t trans; - int n; - int k; + cham_uplo_t uplo; + cham_trans_t trans; + int n; + int k; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; }; @@ -110,6 +110,7 @@ CODELETS_GPU( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_hip_func, STARPU_HIP_ASYNC ) CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -122,11 +123,11 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, return; } - struct cl_zsyrk_args_s *clargs = NULL; void (*callback)(void*); - int accessC; - int exec = 0; - const char *cl_name = "zsyrk"; + struct cl_zsyrk_args_s *clargs = NULL; + int exec = 0; + const char *cl_name = "zsyrk"; + int accessC; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -174,3 +175,83 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, + int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, uplo, n, n, nb, + beta, C, Cm, Cn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( zsyrk, 2 ); + int accessC; + + /* Reduce the C access if needed */ + accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, C->get_rankof( C, Cm, Cn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_zsyrk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_zsyrk_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsyrk_callback ); + + /* Flops */ + task->flops = flops_zsyrk( k, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + C->get_blktile( C, Cm, Cn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_zsyrk", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index a3f695f2c685eb05e66525c6fca957af247dbe42..39c7de9667a60b79bd3ee9eb2e04482c1e67f994 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -24,10 +24,10 @@ #include "runtime_codelet_z.h" struct cl_ztradd_args_s { - cham_uplo_t uplo; - cham_trans_t trans; - int m; - int n; + cham_uplo_t uplo; + cham_trans_t trans; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAMELEON_Complex64_t beta; }; @@ -53,6 +53,7 @@ cl_ztradd_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU( ztradd, cl_ztradd_cpu_func ) +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_ztradd( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, @@ -64,11 +65,11 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, return; } - struct cl_ztradd_args_s *clargs = NULL; void (*callback)(void*); - int accessB; - int exec = 0; + struct cl_ztradd_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "ztradd"; + int accessB; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -110,3 +111,83 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_ztradd( const RUNTIME_option_t *options, + cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *B, int Bm, int Bn ) +{ + if ( alpha == 0. ) { + INSERT_TASK_zlascal( options, uplo, m, n, nb, + beta, B, Bm, Bn ); + return; + } + + INSERT_TASK_COMMON_PARAMETERS( ztradd, 2 ); + int accessB; + + + /* Reduce the B access if needed */ + accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW; + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, accessB ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_ztradd_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_ztradd_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztradd_callback ); + + /* Flops */ + //task->flops = flops_ztradd( m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_ztradd", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 229de0304d8fd6b1be2199a25cf3231ea28ec9ca..d9d25ed28b1b5a275df649e124453a81ab93f082 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -28,12 +28,12 @@ #include "runtime_codelet_z.h" struct cl_ztrmm_args_s { - cham_side_t side; - cham_uplo_t uplo; - cham_trans_t transA; - cham_diag_t diag; - int m; - int n; + cham_side_t side; + cham_uplo_t uplo; + cham_trans_t transA; + cham_diag_t diag; + int m; + int n; CHAMELEON_Complex64_t alpha; }; @@ -106,15 +106,16 @@ CODELETS_GPU( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_hip_func, STARPU_HIP_ASYNC ) CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrmm_args_s *clargs = NULL; void (*callback)(void*); - int exec = 0; + struct cl_ztrmm_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "ztrmm"; /* Handle cache */ @@ -160,3 +161,74 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + INSERT_TASK_COMMON_PARAMETERS( ztrmm, 2 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_ztrmm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_ztrmm_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztrmm_callback ); + + /* Flops */ + task->flops = flops_ztrmm( side, m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_ztrmm", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 8d0656b9915417706d2564dce76e5c5b5b85bc97..6961500fd9518f8624d2fc03d7634695f1e97861 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -30,12 +30,12 @@ #include "runtime_codelet_z.h" struct cl_ztrsm_args_s { - cham_side_t side; - cham_uplo_t uplo; - cham_trans_t transA; - cham_diag_t diag; - int m; - int n; + cham_side_t side; + cham_uplo_t uplo; + cham_trans_t transA; + cham_diag_t diag; + int m; + int n; CHAMELEON_Complex64_t alpha; }; @@ -113,15 +113,16 @@ CODELETS_GPU( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_hip_func, STARPU_HIP_ASYNC ) CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC ) #endif +#if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, int m, int n, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrsm_args_s *clargs = NULL; void (*callback)(void*); - int exec = 0; + struct cl_ztrsm_args_s *clargs = NULL; + int exec = 0; const char *cl_name = "ztrsm"; /* Handle cache */ @@ -168,3 +169,74 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, (void)nb; } + +#else + +void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + INSERT_TASK_COMMON_PARAMETERS( ztrsm, 2 ); + + /* + * Set the data handles and initialize exchanges if needed + */ + starpu_cham_exchange_init_params( options, ¶ms, B->get_rankof( B, Bm, Bn ) ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R ); + starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_RW ); + + /* + * Not involved, let's return + */ + if ( nbdata == 0 ) { + return; + } + + if ( params.do_execute ) + { + int ret; + struct starpu_task *task = starpu_task_create(); + task->cl = cl; + + /* Set codelet parameters */ + clargs = malloc( sizeof( struct cl_ztrsm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + + task->cl_arg = clargs; + task->cl_arg_size = sizeof( struct cl_ztrsm_args_s ); + task->cl_arg_free = 1; + + /* Set common parameters */ + starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztrsm_callback ); + + /* Flops */ + task->flops = flops_ztrsm( side, m, n ); + + /* Refine name */ + task->name = chameleon_codelet_name( cl_name, 2, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ) ); + + ret = starpu_task_submit( task ); + if ( ret == -ENODEV ) { + task->destroy = 0; + starpu_task_destroy( task ); + chameleon_error( "INSERT_TASK_ztrsm", "Failed to submit the task to StarPU" ); + return; + } + } + + starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); + + (void)nb; +} + +#endif diff --git a/runtime/starpu/control/runtime_async.c b/runtime/starpu/control/runtime_async.c index 790181977e43c4d721bdb398f9a79701e0a5f1e7..848d0f7aa54291b4a86546ead770ad2f1c422f4a 100644 --- a/runtime/starpu/control/runtime_async.c +++ b/runtime/starpu/control/runtime_async.c @@ -24,11 +24,10 @@ /** * Create a sequence */ -int RUNTIME_sequence_create( CHAM_context_t *chamctxt, +int RUNTIME_sequence_create( CHAM_context_t *chamctxt, RUNTIME_sequence_t *sequence ) { (void)chamctxt; - sequence->comm = chamctxt->comm; return CHAMELEON_SUCCESS; } diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 2c285f33056c034220f273701823914b1d75c34f..d24dce7a70c5b9eacc2211c983a10fb34220a716 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -55,6 +55,9 @@ #cmakedefine HAVE_STARPU_MPI_REDUX #cmakedefine HAVE_STARPU_MPI_REDUX_WRAPUP #cmakedefine HAVE_STARPU_MPI_DATA_CPY_PRIORITY +#cmakedefine HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION + +#cmakedefine CHAMELEON_STARPU_USE_INSERT #if (!defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER) && !defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER)) && defined(CHAMELEON_USE_MPI_DATATYPES) #error "This version of StarPU does not support MPI datatypes (Please compile with -DCHAMELEON_USE_MPI_DATATYPES=OFF)" diff --git a/runtime/starpu/include/chameleon_starpu_internal.h b/runtime/starpu/include/chameleon_starpu_internal.h index 8b7c3ecd047f308787a9fbb25f755dd0ac0b3b89..977733dc9e72518c539f91a2a45074016ff49d28 100644 --- a/runtime/starpu/include/chameleon_starpu_internal.h +++ b/runtime/starpu/include/chameleon_starpu_internal.h @@ -28,6 +28,7 @@ #include "control/common.h" #include "chameleon_starpu.h" +#include "chameleon/flops.h" /* Chameleon interfaces for StarPU */ #include "cham_tile_interface.h" @@ -125,6 +126,18 @@ void RUNTIME_set_reduction_methods(starpu_data_handle_t handle, cham_flttype_t d #include "runtime_mpi.h" #include "runtime_wontuse.h" +static inline starpu_data_handle_t * +chameleon_starpu_data_gethandle( const CHAM_desc_t *A, int m, int n ) +{ + int64_t mm = m + (A->i / A->mb); + int64_t nn = n + (A->j / A->nb); + + starpu_data_handle_t *ptrtile = A->schedopt; + ptrtile += ((int64_t)A->lmt) * nn + mm; + + return ptrtile; +} + #if defined(CHAMELEON_USE_MPI) && defined(HAVE_STARPU_MPI_CACHED_RECEIVE) static inline int chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n) @@ -205,4 +218,272 @@ chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n) #define RUNTIME_END_ACCESS_DECLARATION \ RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION; +#define INSERT_TASK_COMMON_PARAMETERS( _name_, _nbuffer_ ) \ + struct starpu_data_descr descrs[_nbuffer_]; \ + struct starpu_mpi_task_exchange_params params; \ + struct cl_##_name_##_args_s *clargs = NULL; \ + struct starpu_codelet *cl = &cl_##_name_; \ + const char *cl_name = #_name_; \ + int nbdata = 0; + +/** + * This section defines the codelet functions to manage MPI cache and data + * echanges before and after submitting tasks + */ +#if !defined(CHAMELEON_STARPU_USE_INSERT) +#if !defined(CHAMELEON_USE_MPI) + +/** + * @brief Empty data structure to mimic the one provided by StarPU when MPI is enabled + */ +struct starpu_mpi_task_exchange_params { + int do_execute; +}; + +static inline void +starpu_cham_exchange_init_params( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params *params, + int xrank ) +{ + params->do_execute = 1; + (void)options; + (void)xrank; +} + +static inline void +starpu_cham_exchange_data_before_execution( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params params, + int *nbdata, + struct starpu_data_descr *descrs, + const CHAM_desc_t *A, + int Am, + int An, + enum starpu_data_access_mode mode ) +{ + descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An ); + descrs[*nbdata].mode = mode; + (*nbdata)++; + + (void)options; + (void)params; + return; +} + +#define starpu_cham_task_exchange_data_after_execution( ... ) do {} while(0) + +#else + +/** + * @brief Internal function to initialize the StarPU paramas structure. + * + * @param[in] options + * The runtime options used to set common informations such as + * communicator, rank, and priority. + * + * @param[in,out] params + * On entry, the allocated params structure. On exit the fields of the + * structure are initialized. + * + * @param[in] xrank + * The MPI rank that will execute the task. STARPU_MPI_PER_NODE if all + * nodes excute it. + * + */ +static inline void +starpu_cham_exchange_init_params( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params *params, + int xrank ) +{ + params->me = options->sequence->myrank; + params->xrank = xrank; + params->priority = options->priority; + params->do_execute = ( xrank == STARPU_MPI_PER_NODE ) || ( xrank == params->me ); +} + +/** + * @brief Internal wrapper to starpu_mpi_task_exchange_data_before_execution(), + * that also perform the cache operation done in the CAHMELEON_ACCESS_X() macros + * in other runtimes. + * + * @param[in] options + * The options to parameterize the task + * + * @param[in] params + * The starpu parameters for the exchange functions. Needs to be + * initialized by starpu_cham_init_exchange_param() function. + * + * @param[in,out] nbdata + * On entry the number of data already registered in descrs. On exist, + * the counter is updated if the next handle is registered in the + * structure. + * + * @param[in,out] descrs + * The array of starpu data descriptors (handle + mode). On entry, it + * is allcoated to the maximum number of data for the task, and + * contains the already registered nbdata handles and their associated + * modes. On exit, it is updated with the new handle if needed. + * + * @param[in] A + * The descriptor in which to find the piece of data + * + * @param[in] Am + * The row index of the piece of data + * + * @param[in] An + * The column index of the piece of data + * + * @param[in] mode + * The access mode + * + */ +static inline void +starpu_cham_exchange_data_before_execution( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params params, + int *nbdata, + struct starpu_data_descr *descrs, + const CHAM_desc_t *A, + int Am, + int An, + enum starpu_data_access_mode mode ) +{ + unsigned need_submit = 0; + starpu_data_handle_t *ptrtile = chameleon_starpu_data_gethandle( A, Am, An ); + + /* + * Manage local cache through internal function to avoid the creation of + * handles if not necessary + */ + if ( chameleon_desc_islocal( A, Am, An ) ) { + need_submit = 1; + } + else { + if ( *ptrtile && ( mode & STARPU_W ) && + starpu_mpi_cached_receive( *ptrtile ) ) + { + need_submit = 1; + } + } + if ( options->forcesub && ( mode & STARPU_MPI_REDUX ) ) + { + need_submit = 1; + } + + if ( !need_submit && !params.do_execute ) { + return; + } + + /* + * If we need to submit, let's create the data handle and ask StarPU to perform + * the necessary communications + */ + descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An ); + descrs[*nbdata].mode = mode; + + starpu_mpi_exchange_data_before_execution( + options->sequence->comm, descrs[*nbdata].handle, mode, params ); + + (*nbdata)++; + return; +} + +/** + * @brief Internal wrapper to starpu_mpi_task_exchange_data_after_execution(). + * + * @param[in] options + * The options to get the communicator. + * + * @param[in] params + * The structure that stores a few parameters initialized by + * starpu_mpi_task_exchange_data_before_execution(). + * + * @param[in] nbdata + * The size of the descr array. + * + * @param[in] descrs + * The array of the handle with their associated mode. The array is + * initialized in starpu_mpi_task_exchange_data_before_execution(). + * + */ +static inline void +starpu_cham_task_exchange_data_after_execution( const RUNTIME_option_t *options, + struct starpu_mpi_task_exchange_params params, + int nbdata, + struct starpu_data_descr *descrs ) +{ + starpu_mpi_task_exchange_data_after_execution( + options->sequence->comm, descrs, nbdata, params ); +} + +#endif + +typedef void (*callback_fct_t)(void *); + +/** + * @brief Internal function to initialize the task common parts. + * + * @param[in] options + * The runtime options used to set common informations such as + * communicator, rank, and priority. + * + * @param[in,out] task + * The task for which to complete the initialization + * + * @param[in] nbdata + * The size of the descr array. + * + * @param[in] descrs + * The array of the handle with their associated mode. The array is + * initialized in starpu_mpi_task_exchange_data_before_execution(). + * + * @param[in] callback + * The profiling callback function pointe used if profiling is enabled. + * + */ +static inline void +starpu_cham_task_set_options( const RUNTIME_option_t *options, + struct starpu_task *task, + int nbdata, + struct starpu_data_descr *descrs, + callback_fct_t callback ) +{ + int i; + + task->priority = options->priority; + +#if defined(CHAMELEON_RUNTIME_SYNC) + task->synchronous = 1; +#endif + + /* Callback for profiling information */ + if ( options->profiling ) { + task->callback_func = callback; + } + + /* Specific worker id */ + if ( options->workerid != -1 ) { + task->workerid = options->workerid; + task->execute_on_a_specific_worker = 1; + } + + /* Parallel tasks */ + task->possibly_parallel = options->parallel; + + /* Set the where here */ + // task->where; /* Do restriction here */ + + task->nbuffers = nbdata; + for ( i = 0; i < task->nbuffers; i++ ) { + enum starpu_data_access_mode mode = descrs[i].mode; + assert( descrs[i].handle ); + + if ( mode & STARPU_MPI_REDUX ) { + mode = STARPU_RW | STARPU_COMMUTE; + } + + STARPU_TASK_SET_HANDLE( task, descrs[i].handle, i ); + STARPU_TASK_SET_MODE( task, mode, i ); + } +} +#endif /* !defined(CHAMELEON_STARPU_USE_INSERT) */ + #endif /* _chameleon_starpu_internal_h_ */