diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index afd17c16f2a60d1b5cb35616151072872fdb3de2..892e1405401236a94252ff1d3281d65a571e0880 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -52,6 +52,7 @@ _extra_blas = [ ('', 'sgered', 'dgered', 'cgered', 'zgered' ), ('', 'sgerst', 'dgerst', 'cgerst', 'zgerst' ), ('', 'sipiv_allreduce', 'dipiv_allreduce', 'cipiv_allreduce', 'zipiv_allreduce' ), + ('', 'sperm_allreduce', 'dperm_allreduce', 'cperm_allreduce', 'zperm_allreduce' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 236482682266032654bcc6a8e6050b617134fa98..9b843c60a057a2c5fe2e4e3321b94c02e968fe62 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -583,4 +583,181 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, int h, int n ); +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce - Performs an allreduce operation on the tile + * U(Um, Un) according to the permutation ipiv. This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] k + * The number of rows in the tile U(Um, Un). + * + * @param[in] n + * The number of columns in the tile U(Um, Un). + * + * @param[inout] U + * The descriptor of the workspace used for the permutation in the LU + * factorization with partial pivoting. 
+ * + * @param[in] Um + * The row index of the tile used in U. + * + * @param[in] Un + * The column index of the tile used in U. + * + * @param[in] ws + * The workspace to handle the data in the LU factorization with + * partial pivoting. + * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processes + * involved in the permutation. This task is used in the LU factorization with + * partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] Am + * The row index of the tile used in A. + * + * @param[in] An + * The column index of the tile used in A. + * + * @param[in] myrank + * The rank of the current process. + * + * @param[in] np + * The number of processes involved in the permutation. + * + * @param[in] proc_involved + * The list of the processes involved in the permutation. + * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the + * processes involved in the permutation. 
This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] myrank + * The rank of the current process. + * + * @param[in] np + * The number of processes involved in the permutation. + * + * @param[in] proc_involved + * The list of the processes involved in the permutation. + * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk + * to the processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] k + * The index of the panel factorized. + * + * @param[in] n + * The index of the panel to permute. 
+ * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 6b24081b2bd7f58f330e28b142f8c714ba208009..e46fd45b105edfcf96d1bccb2a6780f481a1a9a7 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -86,6 +86,7 @@ set(CODELETS_ZSRC codelets/codelet_zlaswp_batched.c codelets/codelet_zlatro.c codelets/codelet_zlauum.c + codelets/codelet_zperm_allreduce.c codelets/codelet_zplghe.c codelets/codelet_zplgsy.c codelets/codelet_zplrnt.c diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..c21490d02f42447b07e8516842b2b9b840850006 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -0,0 +1,183 @@ +/** + * + * @file starpu/codelet_zperm_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon StarPU codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu_internal.h" +#include "runtime_codelet_z.h" +#include <coreblas/cblas_wrapper.h> + +#if defined(CHAMELEON_USE_MPI) +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ + /* No task is submitted here; ws is meant to hold a struct chameleon_pzgetrf_s. */ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} + +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ + int p; + + for ( p = 0; p < np; p ++ ) { + if ( proc_involved[ p ] == myrank ) { + continue; + } + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + proc_involved[ p ], NULL, NULL ); + } +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ + int p; + + for ( p = 0; p < np; p++ ) { + if ( proc_involved[ p ] == myrank ) { + continue; + } + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RUNTIME_perm_getaddr( ipiv, ipivk ), + proc_involved[ p ], NULL, NULL ); + } +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ + int b, rank; + + for ( b = k+1; (b < A->mt) && ((b-(k+1)) < A->p); b ++ ) { + rank = A->get_rankof( A, b, n ); + if ( rank == A->myrank ) { + continue; + } + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RUNTIME_invp_getaddr( ipiv, ipivk ), + rank, NULL, 
NULL ); + } +} +#else +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + 
int An, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)A; + (void)k; + (void)n; +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} +#endif