diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index f444409b8279007d43d810f6b603595c414f9819..90b4578d47eadeab248d9d47cb45a4a93b74a1b2 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -25,7 +25,7 @@ * @author Romain Peressoni * @author Matthieu Kuhn * @author Ana Hourcau - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -188,12 +188,12 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, void INSERT_TASK_zlaset2( const RUNTIME_option_t *options, cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *tileA, int tileAm, int tileAn ); -void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, +void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *tIPIV, int tIPIVk, const CHAM_desc_t *tileA, int tileAm, int tileAn, const CHAM_desc_t *tileB, int tileBm, int tileBn ); -void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, +void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *tIPIV, int tIPIVk, const CHAM_desc_t *tileA, int tileAm, int tileAn, @@ -588,15 +588,20 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce - Perfoms an allreduce operation on the tile - * U(Um, Un) according to the permutation ipiv. This task is used in the LU - * factorization with partial pivoting. + * @brief Performs an allreduce operation on the tile + * U(Um, Un) according to the permutation ipiv. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation. 
+ * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] A * The descriptor of the matrix A. * @@ -630,6 +635,7 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -645,9 +651,9 @@ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processus - * involved in the permutation. This task is used in the LU factorization with - * partial pivoting. + * @brief Sends the tile A(Am, An) to the processes + * involved in the permutation. This task is used in the LU factorization with + * partial pivoting. * ******************************************************************************* * @@ -687,15 +693,20 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the - * processus involved in the permutation. This task is used in the LU - * factorization with partial pivoting. + * @brief Sends the permutation ipivk to the + * processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting. 
@@ -715,6 +726,7 @@ void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -726,15 +738,20 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, * * @ingroup CHAMELEON_Complex64_t * - * INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk - * to the processus involved in the permutation. This task is used in the LU - * factorization with partial pivoting. + * @brief Sends the inverse permutation ipivk + * to the processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. * ******************************************************************************* * * @param[in] options * The runtime options data structure to pass through all insert_task calls. * + * @param[in] dir + * Specifies the order of the permutation. + * = ChamDirForward: Natural order + * = ChamDirBackward: Reverse order + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting. 
@@ -754,6 +771,7 @@ void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, ******************************************************************************* */ void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -761,3 +779,4 @@ void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ); #endif /* _chameleon_tasks_z_h_ */ + diff --git a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c index bce58c771ef3052ce4d20d16232082cd9a746f66..93bf20aef11964fa548adb7739b000af575b04ba 100644 --- a/runtime/openmp/codelets/codelet_zlaswp.c +++ b/runtime/openmp/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -20,7 +20,7 @@ #include "coreblas/coreblas_ztile.h" void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) @@ -38,10 +38,11 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, } (void)options; + (void)dir; } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -59,4 +60,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, } (void)options; + (void)dir; } diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c index eac34fdfd1f8a0814c277f7acb8a9b85cb594ec7..8b20a60fd43332dac7373edcb2de40ee552d050a 100644 --- a/runtime/openmp/codelets/codelet_zperm_allreduce.c +++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author 
Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c index 12aaf7089ff41f4e4090e0fb6f18e518c9813fd3..65849c96d1aae96cc1000dd93e5efbebe481c7d9 100644 --- a/runtime/parsec/codelets/codelet_zlaswp.c +++ b/runtime/parsec/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -33,7 +33,7 @@ CORE_zlaswp_get_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) 
@@ -54,6 +54,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, sizeof(int), &(tileU->ld), VALUE, PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT, PARSEC_DTD_ARG_END ); + + (void)dir; } static inline int @@ -70,7 +72,7 @@ CORE_zlaswp_set_parsec( parsec_execution_stream_t *context, } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -91,4 +93,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, sizeof(int), &(tileB->ld), VALUE, PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ), chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT, PARSEC_DTD_ARG_END ); + + (void)dir; } diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c index 9ceb440c8a4e677630a68355daa7defda7f904fa..f68148e24b5e4c4e7d42d7248c8bf4a9948477c6 100644 --- a/runtime/parsec/codelets/codelet_zperm_allreduce.c +++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ 
INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c index 176dd16916eb51e1b698ad0d17dbd0d37c1a1d61..8f5a1b57fd52bd2e401273171584ebcca1478e50 100644 --- a/runtime/quark/codelets/codelet_zlaswp.c +++ b/runtime/quark/codelets/codelet_zlaswp.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Mathieu Faverge - * @date 2024-02-18 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -30,7 +30,7 @@ static void CORE_zlaswp_get_quark( Quark *quark ) } void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) @@ -46,6 +46,8 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT, 0 ); + + (void)dir; } static void CORE_zlaswp_set_quark( Quark *quark ) @@ -59,7 +61,7 @@ static void CORE_zlaswp_set_quark( Quark *quark ) } void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) @@ -75,4 +77,6 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, sizeof(CHAM_tile_t*), 
RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT, 0 ); + + (void)dir; } diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c index f297d343b33455ba6340f0b81c45e8d01d29600f..1a2a7089c8addc5715d074a6c04bc5e8732aed1b 100644 --- a/runtime/quark/codelets/codelet_zperm_allreduce.c +++ b/runtime/quark/codelets/codelet_zperm_allreduce.c @@ -11,7 +11,7 @@ * * @version 1.3.0 * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -38,6 +38,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, @@ -45,6 +46,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, int *proc_involved ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)myrank; @@ -54,6 +56,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -61,6 +64,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, int n ) { (void)options; + (void)dir; (void)ipiv; (void)ipivk; (void)A; @@ -70,6 +74,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -81,6 +86,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, void *ws ) { (void)options; + (void)dir; (void)A; (void)U; (void)Um; diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c index 81c28d92f05d6c23e85e743b8402b79db31815b1..3829763abd896ca9db917a9d0573ac4d9b9b5255 100644 --- a/runtime/starpu/codelets/codelet_zlaswp.c +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -13,7 
+13,7 @@ * @author Mathieu Faverge * @author Matthieu Kuhn * @author Alycia Lisito - * @date 2024-11-12 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -48,11 +48,12 @@ CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) { + void *ipiv_handle; struct starpu_codelet *codelet = &cl_zlaswp_get; if ( A->get_rankof( A, Am, An) != A->myrank ) { return; @@ -63,12 +64,18 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, clargs->m0 = m0; clargs->k = k; + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL; rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), STARPU_PRIORITY, options->priority, @@ -80,18 +87,26 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, #else /* defined(CHAMELEON_STARPU_USE_INSERT) */ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *U, int Um, int Un ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; if ( A->get_rankof( A, Am, An) != A->myrank ) { return; } + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_get, 
zlaswp_get, zlaswp, 3); /* @@ -99,8 +114,7 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, */ starpu_cham_exchange_init_params( options, &params, U->get_rankof( U, Um, Un ) ); starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, - RUNTIME_perm_getaddr( ipiv, ipivk ), - STARPU_R ); + ipiv_handle, STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ), STARPU_RW | STARPU_COMMUTE ); @@ -157,12 +171,14 @@ static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg ) CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func ) #if defined(CHAMELEON_STARPU_USE_INSERT) + void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { + void *ipiv_handle; struct starpu_codelet *codelet = &cl_zlaswp_set; if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { return; @@ -173,12 +189,19 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, clargs->m0 = m0; clargs->k = k; + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + //void (*callback)(void*) = options->profiling ? 
cl_zlaswp_set_callback : NULL; rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaswp_args_s), - STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, @@ -186,20 +209,30 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, STARPU_EXECUTE_ON_WORKER, options->workerid, 0 ); } -#else + +#else /* defined(CHAMELEON_STARPU_USE_INSERT) */ + void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, - int m0, int k, + cham_dir_t dir, int m0, int k, const CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; if ( B->get_rankof( B, Bm, Bn) != A->myrank ) { return; } + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_set, zlaswp_set, zlaswp, 3); /* @@ -207,8 +240,7 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, */ starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) ); starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, - RUNTIME_invp_getaddr( ipiv, ipivk ), - STARPU_R ); + ipiv_handle, STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R ); starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( B, ChamComplexDouble, Bm, Bn ), STARPU_RW ); @@ -242,4 +274,5 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, } starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs ); } -#endif +#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */ + diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c index 
e32b7ad9c46a2303eb1c4c6a18d442935fca6d3a..a479056c5f9321b75cd89a99349fd1ef1c3f3976 100644 --- a/runtime/starpu/codelets/codelet_zperm_allreduce.c +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -12,7 +12,7 @@ * @version 1.3.0 * @author Alycia Lisito * @author Pierre Esterie - * @date 2024-11-14 + * @date 2025-03-24 * @precisions normal z -> c d s * */ @@ -21,6 +21,7 @@ #include <coreblas/cblas_wrapper.h> #if defined(CHAMELEON_USE_MPI) + struct cl_redux_args_s { int tempmm; int mb; @@ -91,6 +92,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, static void INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_desc_t *U, CHAM_ipiv_t *ipiv, int ipivk, @@ -105,6 +107,15 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, int p_first ) { struct cl_redux_args_s *clargs; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + clargs = malloc( sizeof( struct cl_redux_args_s ) ); clargs->tempmm = tempmm; clargs->mb = U->mb; @@ -121,7 +132,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_s), STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, me, n), STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n), - STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_R, ipiv_handle, STARPU_EXECUTE_ON_NODE, me, STARPU_EXECUTE_ON_WORKER, options->workerid, STARPU_PRIORITY, options->priority, @@ -151,6 +162,7 @@ INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, static void INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_desc_t *U, CHAM_ipiv_t *ipiv, int ipivk, @@ -164,8 +176,16 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, int np, int p_first ) { - int ret; + int ret; struct starpu_task *task; + void *ipiv_handle; + + if ( dir 
== ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zperm_allreduce_send, zperm_allreduce, redux, 3 ); @@ -176,7 +196,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, src, n ), STARPU_R ); - starpu_cham_register_descr( &nbdata, descrs, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R ); + starpu_cham_register_descr( &nbdata, descrs, ipiv_handle, STARPU_R ); task = starpu_task_create(); task->cl = cl; @@ -221,6 +241,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, static void zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -229,10 +250,10 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, int ipivk, int k, int n, - struct chameleon_pzgetrf_s *ws) + struct chameleon_pzgetrf_s *ws ) { int *proc_involved = ws->proc_involved; - int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); + int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k ); int np_iter = np_involved; int p_recv, p_send, me, p_first; int shift = 1; @@ -253,9 +274,11 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; INSERT_TASK_zperm_allreduce_send( options, U, A->myrank, p_send, n ); - INSERT_TASK_zperm_allreduce_recv( options, U, ipiv, ipivk, A->myrank, p_recv, + INSERT_TASK_zperm_allreduce_recv( options, dir, U, ipiv, ipivk, A->myrank, p_recv, n, k == (A->mt-1) ? 
A->m - k * A->mb : A->mb, - chameleon_desc_datadist_get_iparam(A, 0), chameleon_desc_datadist_get_iparam(A, 1), shift, np_involved, p_first ); + chameleon_desc_datadist_get_iparam(A, 0), + chameleon_desc_datadist_get_iparam(A, 1), + shift, np_involved, p_first ); shift = shift << 1; np_iter = chameleon_ceil( np_iter, 2 ); @@ -265,6 +288,7 @@ zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -280,7 +304,7 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, switch( alg ) { case ChamStarPUTasks: default: - zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); + zperm_allreduce_chameleon_starpu_task( options, dir, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); } } @@ -307,33 +331,51 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, int np, int *proc_involved ) { - int p; + int p; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } for ( p = 0; p < np; p++ ) { if ( proc_involved[ p ] == myrank ) { continue; } starpu_mpi_get_data_on_node_detached( options->sequence->comm, - RUNTIME_perm_getaddr( ipiv, ipivk ), + ipiv_handle, proc_involved[ p ], NULL, NULL ); } } void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, int k, int n ) { - int b, rank; + int b, rank; + void *ipiv_handle; + + if ( dir == ChamDirForward ) { + ipiv_handle = RUNTIME_invp_getaddr( ipiv, ipivk ); + } + else { + ipiv_handle = RUNTIME_perm_getaddr( ipiv, ipivk ); + } for ( b = k+1; (b < A->mt) && ((b-(k+1)) < 
chameleon_desc_datadist_get_iparam(A, 0)); b ++ ) { rank = A->get_rankof( A, b, n ); @@ -341,10 +383,11 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, continue; } starpu_mpi_get_data_on_node_detached( options->sequence->comm, - RUNTIME_invp_getaddr( ipiv, ipivk ), + ipiv_handle, rank, NULL, NULL ); } } + #else void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, @@ -353,7 +396,7 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, int An, int myrank, int np, - int *proc_involved ) + int *proc_involved ) { (void)options; (void)A; @@ -366,11 +409,12 @@ INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, int myrank, int np, - int *proc_involved ) + int *proc_involved ) { (void)options; (void)ipiv; @@ -382,6 +426,7 @@ INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + cham_dir_t dir, CHAM_ipiv_t *ipiv, int ipivk, const CHAM_desc_t *A, @@ -398,6 +443,7 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + cham_dir_t dir, const CHAM_desc_t *A, CHAM_desc_t *U, int Um, @@ -419,4 +465,5 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, (void)n; (void)ws; } + #endif