diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index afd17c16f2a60d1b5cb35616151072872fdb3de2..892e1405401236a94252ff1d3281d65a571e0880 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -52,6 +52,7 @@ _extra_blas = [ ('', 'sgered', 'dgered', 'cgered', 'zgered' ), ('', 'sgerst', 'dgerst', 'cgerst', 'zgerst' ), ('', 'sipiv_allreduce', 'dipiv_allreduce', 'cipiv_allreduce', 'zipiv_allreduce' ), + ('', 'sperm_allreduce', 'dperm_allreduce', 'cperm_allreduce', 'zperm_allreduce' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index d95b415d9bbe5473f38be0d3b304c7c3c898adf1..5b63d1e010f2d12dab9eef167f12cf94b8e0e4a4 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -26,6 +26,7 @@ #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n #define Up(m,n) &(ws->Up), m, n +#define Wu(m,n) &(ws->Wu), m, n /* * All the functions below are panel factorization variant. @@ -214,10 +215,6 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, int m, h, b, nbblock; int tempkm, tempkn, tempmm, minmn; - if ( ! ws->involved ) { - return; - } - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? 
A->n-k*A->nb : A->nb; minmn = chameleon_min( tempkm, tempkn ); @@ -340,25 +337,10 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, int k, RUNTIME_option_t *options ) { -#if defined ( CHAMELEON_USE_MPI ) - int *proc_involved = malloc( sizeof( int ) * chameleon_min( A->p, A->mt - k) ); - int b; - - /* 2DBC only */ - ws->involved = 0; - for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) { - int rank = chameleon_getrankof_2d( A, b, k ); - proc_involved[ b-k ] = rank; - if ( rank == A->myrank ) { - ws->involved = 1; - } - } - ws->proc_involved = proc_involved; - if ( ws->involved == 0 ) { - free( proc_involved ); + chameleon_get_proc_involved_in_panelk_2dbc( A, k, k, ws ); + if ( !ws->involved ) { return; } -#endif /* TODO: Should be replaced by a function pointer */ switch( ws->alg ) { @@ -388,9 +370,6 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, default: chameleon_pzgetrf_panel_facto_nopiv( ws, A, ipiv, k, options ); } -#if defined ( CHAMELEON_USE_MPI ) - free( proc_involved ); -#endif } /** @@ -411,6 +390,19 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, int m; int tempkm, tempkn, tempnn, minmn; + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -418,28 +410,26 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, /* Extract selected rows into U */ INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - A(k, n), U(k, n) ); + A(k, n), Wu(A->myrank, n) ); /* * perm array is made of size tempkm for the first row especially. * Otherwise, the final copy back to the tile may copy only a partial tile */ INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, - ipiv, k, A(k, n), U(k, n) ); + ipiv, k, A(k, n), Wu(A->myrank, n) ); for(m=k+1; m<A->mt; m++){ /* Extract selected rows into A(k, n) */ INSERT_TASK_zlaswp_get( options, m*A->mb, minmn, - ipiv, k, A(m, n), U(k, n) ); + ipiv, k, A(m, n), Wu(A->myrank, n) ); /* Copy rows from A(k,n) into their final position */ INSERT_TASK_zlaswp_set( options, m*A->mb, minmn, ipiv, k, A(k, n), A(m, n) ); } - INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - U(k, n), A(k, n) ); - - RUNTIME_data_flush( options->sequence, U(k, n) ); + INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, + Wu(A->myrank, n), ws ); } break; default: @@ -462,6 +452,20 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, { int m; int tempkm, tempkn, tempnn, minmn; + + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } + void **clargs = malloc( sizeof(char *) ); *clargs = NULL; @@ -472,25 +476,23 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, /* Extract selected rows into U */ INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - A(k, n), 
U(k, n) ); + A(k, n), Wu(A->myrank, n) ); /* * perm array is made of size tempkm for the first row especially. * Otherwise, the final copy back to the tile may copy only a partial tile */ INSERT_TASK_zlaswp_get( options, k*A->mb, tempkm, - ipiv, k, A(k, n), U(k, n) ); + ipiv, k, A(k, n), Wu(A->myrank, n) ); for(m=k+1; m<A->mt; m++){ - INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, k, m, n, (void *)ws, - ipiv, k, A, &(ws->U), clargs ); + INSERT_TASK_zlaswp_batched( options, m*A->mb, minmn, (void *)ws, ipiv, k, + A(m, n), A(k, n), Wu(A->myrank, n), clargs ); } - INSERT_TASK_zlaswp_batched_flush( options, k, n, ipiv, k, A, &(ws->U), clargs ); + INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); - INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, - U(k, n), A(k, n) ); + INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, Wu(A->myrank, n), ws ); - RUNTIME_data_flush( options->sequence, U(k, n) ); free( clargs ); } break; @@ -510,7 +512,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t) 1.0; const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0; - int m, tempkm, tempmm, tempnn; + int m, tempkm, tempmm, tempnn, rankAmn, p; tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; @@ -522,25 +524,44 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); } - INSERT_TASK_ztrsm( - options, - ChamLeft, ChamLower, ChamNoTrans, ChamUnit, - tempkm, tempnn, A->mb, - zone, A(k, k), - A(k, n) ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + for ( p = 0; p < ws->np_involved; p++ ) { + INSERT_TASK_ztrsm( + options, + ChamLeft, ChamLower, ChamNoTrans, ChamUnit, + tempkm, tempnn, A->mb, + zone, A(k, k), + Wu(ws->proc_involved[p], n) ); + } + } + else if ( ws->involved ) { + INSERT_TASK_ztrsm( + options, + ChamLeft, ChamLower, ChamNoTrans, ChamUnit, + tempkm, tempnn, A->mb, + zone, A(k, k), + Wu(A->myrank, n) ); + } for (m = k+1; m < A->mt; m++) { tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + rankAmn = A->get_rankof( A, m, n ); INSERT_TASK_zgemm( options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, A->mb, A->mb, mzone, A(m, k), - A(k, n), + Wu(rankAmn, n), zone, A(m, n) ); } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + } + + RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) ); RUNTIME_data_flush( options->sequence, A(k, n) ); } @@ -556,7 +577,7 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, CHAM_context_t *chamctxt; RUNTIME_option_t options; - int k, m, n; + int k, m, n, tempkm, tempnn; int min_mnt = chameleon_min( A->mt, A->nt ); chamctxt = chameleon_context_self(); @@ -581,7 +602,11 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, for (n = k+1; n < A->nt; n++) { options.priority = A->nt-n; - chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options ); + if ( chameleon_involved_in_panelk_2dbc( A, k ) || + chameleon_involved_in_panelk_2dbc( A, n ) ) + { + chameleon_pzgetrf_panel_update( ws, A, IPIV, k, n, &options ); + } } /* Flush panel k */ @@ -596,7 +621,19 @@ void chameleon_pzgetrf( struct 
chameleon_pzgetrf_s *ws, if ( ws->batch_size > 0 ) { for (k = 1; k < min_mnt; k++) { for (n = 0; n < k; n++) { - chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options ); + if ( chameleon_involved_in_panelk_2dbc( A, k ) || + chameleon_involved_in_panelk_2dbc( A, n ) ) + { + chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options ); + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + RUNTIME_data_flush( sequence, A(k, n) ); + } + } + RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); } RUNTIME_perm_flushk( sequence, IPIV, k ); } @@ -604,7 +641,19 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, else { for (k = 1; k < min_mnt; k++) { for (n = 0; n < k; n++) { - chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); + if ( chameleon_involved_in_panelk_2dbc( A, k ) || + chameleon_involved_in_panelk_2dbc( A, n ) ) + { + chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempnn = n == A->nt-1 ? 
A->n-n*A->nb : A->nb; + INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + RUNTIME_data_flush( sequence, A(k, n) ); + } + } + RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); } RUNTIME_perm_flushk( sequence, IPIV, k ); } diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 8fb6734d3e15fe2cc25fb9c1664db8bc9a0f6987..b7e8f87b622c35d68f557f9c59393eabc017c679 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -67,6 +67,12 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) ws->alg = ChamGetrfPPiv; ws->ib = CHAMELEON_IB; +#if defined (CHAMELEON_USE_MPI) + ws->proc_involved = malloc( sizeof( int ) * A->p ); + ws->involved = 0; + ws->np_involved = 0; +#endif + { char *algostr = chameleon_getenv( "CHAMELEON_GETRF_ALGO" ); @@ -112,6 +118,11 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) A->m, A->n, 0, 0, A->m, A->n, A->p, A->q, NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); + chameleon_desc_init( &(ws->Wu), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, A->mb, A->nb, A->mb*A->nb, + A->mb * A->p * A->q, A->n, 0, 0, + A->mb * A->p * A->q, A->n, A->p * A->q, 1, + NULL, NULL, NULL, A->get_rankof_init_arg ); } /* Set ib to 1 if per column algorithm */ @@ -160,6 +171,10 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) { struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)user_ws; +#if defined (CHAMELEON_USE_MPI) + free( ws->proc_involved ); +#endif + if ( ( ws->alg == ChamGetrfNoPivPerColumn ) || ( ws->alg == ChamGetrfPPiv ) || ( ws->alg == ChamGetrfPPivPerColumn ) ) @@ -170,6 +185,11 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) { chameleon_desc_destroy( &(ws->Up) ); } + if ( ( ws->alg == ChamGetrfPPiv ) || + ( ws->alg == ChamGetrfPPivPerColumn ) ) + { + chameleon_desc_destroy( &(ws->Wu) ); + } free( ws ); } diff --git a/control/compute_z.h b/control/compute_z.h index 65d580ad9b0b10e40bd8ffdbc296fe833aa590fb..acb9599f295c77774acccef98c88d7db22c59362 100644 --- a/control/compute_z.h +++ 
b/control/compute_z.h @@ -43,13 +43,15 @@ struct chameleon_pzgemm_s { * @brief Data structure to handle the GETRF workspaces with partial pivoting */ struct chameleon_pzgetrf_s { - cham_getrf_t alg; - int ib; /**< Internal blocking parameter */ - int batch_size; /**< Batch size for the panel */ - CHAM_desc_t U; - CHAM_desc_t Up; - int *proc_involved; - unsigned int involved:1; + cham_getrf_t alg; + int ib; /**< Internal blocking parameter */ + int batch_size; /**< Batch size for the panel */ + CHAM_desc_t U; + CHAM_desc_t Up; /**< Workspace used for the panel factorization */ + CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ + int *proc_involved; + unsigned int involved; + int np_involved; }; /** diff --git a/control/descriptor_helpers.c b/control/descriptor_helpers.c index 9cae1883552fc8f418aca49140cf904dbcdcbed8..b49cb69e9b751e4494ae01de14571010c64e980c 100644 --- a/control/descriptor_helpers.c +++ b/control/descriptor_helpers.c @@ -100,6 +100,52 @@ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k ) { return ( myrank % A->q == k % A->q ); } +/** + * @brief List the MPI processes owning the tiles of a panel for 2DBC distributions, and flag whether the current process is one of them. + * + * @param[in] A + * The matrix descriptor. + * + * @param[in] k + * The index of the first tile row of the panel. + * + * @param[in] n + * The index of the tile column of the panel. + * + * @param[inout] ws_getrf + * The GETRF workspace (struct chameleon_pzgetrf_s); when built with MPI, its proc_involved, np_involved and involved fields are updated.
+ * + */ +void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A, + int k, + int n, + void *ws_getrf ) +{ +#if defined (CHAMELEON_USE_MPI) + struct chameleon_pzgetrf_s *ws = (struct chameleon_pzgetrf_s *)ws_getrf; + int *proc_involved = ws->proc_involved; + int b, rank, np; + + np = 0; + ws->involved = 0; + for ( b = k; (b < A->mt) && ((b-k) < A->p); b ++ ) { + rank = chameleon_getrankof_2d( A, b, n ); + proc_involved[ b-k ] = rank; + np ++; + if ( rank == A->myrank ) { + ws->involved = 1; + } + } + ws->proc_involved = proc_involved; + ws->np_involved = np; +#else + (void)A; + (void)k; + (void)n; + (void)ws_getrf; +#endif +} + /** * @brief Initializes a custom distribution based on an external file. * diff --git a/include/chameleon/descriptor_helpers.h b/include/chameleon/descriptor_helpers.h index da79d04863f4180e6c6ce929fee6b33235998fc3..7bfdeb77ba565c2da0f9668f5d7d347e6e44112c 100644 --- a/include/chameleon/descriptor_helpers.h +++ b/include/chameleon/descriptor_helpers.h @@ -64,6 +64,10 @@ int chameleon_getrankof_custom ( const CHAM_desc_t *A, int m, int n ); */ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int An ); +void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A, + int k, + int n, + void *ws_getrf ); /** * @} diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 236482682266032654bcc6a8e6050b617134fa98..5f1bbcd322293e3104fca313974096bcf711de71 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -199,17 +199,17 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, const CHAM_desc_t *tileA, int tileAm, int tileAn, const CHAM_desc_t *tileB, int tileBm, int tileBn ); void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, - int m0, int minmn, int k, int m, int n, + int m0, int minmn, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *U, + const CHAM_desc_t *Am, int Amm, int Amn, + const CHAM_desc_t *Ak, 
int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ); void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - int k, int n, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *U, + const CHAM_desc_t *Ak, int Akm, int Akn, + const CHAM_desc_t *U, int Um, int Un, void **clargs_ptr ); void INSERT_TASK_zlatro( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb, @@ -583,4 +583,181 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, int h, int n ); +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce - Performs an allreduce operation on the tile + * U(Um, Un) according to the permutation ipiv. This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] k + * The index of the panel factorized. + * + * @param[in] n + * The index of the panel to permute. + * + * @param[inout] U + * The descriptor of the workspace used for the permutation in the LU + * factorization with partial pivoting. + * + * @param[in] Um + * The row index of the tile used in U. + * + * @param[in] Un + * The column index of the tile used in U. + * + * @param[in] ws + * The workspace to handle the data in the LU factorization with + * partial pivoting. 
+ * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_A - Sends the tile A(Am, An) to the processes + * involved in the permutation. This task is used in the LU factorization with + * partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] Am + * The row index of the tile used in A. + * + * @param[in] An + * The column index of the tile used in A. + * + * @param[in] myrank + * The rank of the current process. + * + * @param[in] np + * The number of processes involved in the permutation. + * + * @param[in] proc_involved + * The list of the processes involved in the permutation. + * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_perm - Sends the permutation ipivk to the + * processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. 
+ * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] myrank + * The rank of the current process. + * + * @param[in] np + * The number of processes involved in the permutation. + * + * @param[in] proc_involved + * The list of the processes involved in the permutation. + * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ); + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * INSERT_TASK_zperm_allreduce_send_invp - Sends the inverse permutation ipivk + * to the processes involved in the permutation. This task is used in the LU + * factorization with partial pivoting. + * + ******************************************************************************* + * + * @param[in] options + * The runtime options data structure to pass through all insert_task calls. + * + * @param[in] ipiv + * The pivot structure that contains the information for the LU + * factorization with partial pivoting. + * + * @param[in] ipivk + * The index of the permutation. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] k + * The index of the panel factorized. + * + * @param[in] n + * The index of the panel to permute. 
+ * + ******************************************************************************* + */ +void INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ); + #endif /* _chameleon_tasks_z_h_ */ diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 6b24081b2bd7f58f330e28b142f8c714ba208009..e46fd45b105edfcf96d1bccb2a6780f481a1a9a7 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -86,6 +86,7 @@ set(CODELETS_ZSRC codelets/codelet_zlaswp_batched.c codelets/codelet_zlatro.c codelets/codelet_zlauum.c + codelets/codelet_zperm_allreduce.c codelets/codelet_zplghe.c codelets/codelet_zplgsy.c codelets/codelet_zplrnt.c diff --git a/runtime/openmp/codelets/codelet_zlaswp_batched.c b/runtime/openmp/codelets/codelet_zlaswp_batched.c index 49ac5381ca1d1e4fe3bfb562811675b3d909765b..07fd1eab85abeb6913936ef7980fec919fb03443 100644 --- a/runtime/openmp/codelets/codelet_zlaswp_batched.c +++ b/runtime/openmp/codelets/codelet_zlaswp_batched.c @@ -21,45 +21,57 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, int m0, int minmn, - int k, - int m, - int n, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *Wu, + const CHAM_desc_t *Am, + int Amm, + int Amn, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; (void)m0; (void)minmn; - (void)k; - (void)m; - (void)n; (void)ws; (void)ipiv; (void)ipivk; - (void)A; - (void)Wu; + (void)Am; + (void)Amm; + (void)Amn; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - int k, - int n, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, + const CHAM_desc_t *Ak, + int Akm, + int Akn, const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; - (void)k; - (void)n; 
(void)ipiv; (void)ipivk; - (void)A; + (void)Ak; + (void)Akm; + (void)Akn; (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..cb77c806bcb8ce47a62e7b4e19b2dad3dafc8218 --- /dev/null +++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c @@ -0,0 +1,93 @@ +/** + * + * @file openmp/codelet_zperm_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon openmp codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_openmp.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)A; + (void)k; + (void)n; +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} diff --git 
a/runtime/parsec/codelets/codelet_zlaswp_batched.c b/runtime/parsec/codelets/codelet_zlaswp_batched.c index aa8726690b25d23b6cdd3ea6ff525b9c36be12d3..011d42e8b2359ba7ffbfb9a8022b2c18c9b8e8e0 100644 --- a/runtime/parsec/codelets/codelet_zlaswp_batched.c +++ b/runtime/parsec/codelets/codelet_zlaswp_batched.c @@ -21,45 +21,57 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, int m0, int minmn, - int k, - int m, - int n, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *Wu, + const CHAM_desc_t *Am, + int Amm, + int Amn, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; (void)m0; (void)minmn; - (void)k; - (void)m; - (void)n; (void)ws; (void)ipiv; (void)ipivk; - (void)A; - (void)Wu; + (void)Am; + (void)Amm; + (void)Amn; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - int k, - int n, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, + const CHAM_desc_t *Ak, + int Akm, + int Akn, const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; - (void)k; - (void)n; (void)ipiv; (void)ipivk; - (void)A; + (void)Ak; + (void)Akm; + (void)Akn; (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..30890f8114b857b7c12804c526f4aa4c875b63a1 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c @@ -0,0 +1,93 @@ +/** + * + * @file parsec/codelet_zperm_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon parsec codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_parsec.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)A; + (void)Am; + (void)An; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ + (void)options; + (void)ipiv; + (void)ipivk; + (void)A; + (void)k; + (void)n; +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} diff --git a/runtime/quark/codelets/codelet_zlaswp_batched.c b/runtime/quark/codelets/codelet_zlaswp_batched.c index f96414f27d29f448b7856d1e913e42cc4e15fcff..9ec2148fbe51cbf9cd168c033ac673add97141a2 100644 --- a/runtime/quark/codelets/codelet_zlaswp_batched.c +++ b/runtime/quark/codelets/codelet_zlaswp_batched.c @@ -21,45 +21,57 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, int m0, int minmn, - int k, - int m, - int n, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *Wu, + const CHAM_desc_t *Am, + int Amm, + int Amn, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + 
const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; (void)m0; (void)minmn; - (void)k; - (void)m; - (void)n; (void)ws; (void)ipiv; (void)ipivk; - (void)A; - (void)Wu; + (void)Am; + (void)Amm; + (void)Amn; + (void)Ak; + (void)Akm; + (void)Akn; + (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - int k, - int n, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, + const CHAM_desc_t *Ak, + int Akm, + int Akn, const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { (void)options; - (void)k; - (void)n; (void)ipiv; (void)ipivk; - (void)A; + (void)Ak; + (void)Akm; + (void)Akn; (void)U; + (void)Um; + (void)Un; (void)clargs_ptr; } diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..52281451dd038a9276a2040b9f4c08f7effa63f7 --- /dev/null +++ b/runtime/quark/codelets/codelet_zperm_allreduce.c @@ -0,0 +1,93 @@ +/** + * + * @file quark/codelet_zperm_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon quark codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_quark.h" +#include "chameleon/tasks_z.h" + +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ /* Quark stub: inserts no task, the arguments are only marked as used */ + (void)options; + (void)A; + (void)Am; + (void)An; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ /* Quark stub: inserts no task, the arguments are only marked as used */ + (void)options; + (void)ipiv; + (void)ipivk; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ /* Quark stub: inserts no task, the arguments are only marked as used */ + (void)options; + (void)ipiv; + (void)ipivk; + (void)A; + (void)k; + (void)n; +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ /* Quark stub: inserts no task, the arguments are only marked as used */ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c index ade365c68ff52757a11b9c8077d14ce28e7208d0..96d3108a89b74e67fb31d892b974b0d2d1d7e3a7 100644 --- a/runtime/starpu/codelets/codelet_zlaswp.c +++ b/runtime/starpu/codelets/codelet_zlaswp.c @@ -47,6 +47,9 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options, const CHAM_desc_t *U, int Um, int Un ) { struct starpu_codelet *codelet = &cl_zlaswp_get; + if ( A->get_rankof( A, Am, An) != A->myrank ) { + return; + } //void (*callback)(void*) = options->profiling ?
cl_zlaswp_get_callback : NULL; @@ -91,6 +94,9 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options, const CHAM_desc_t *B, int Bm, int Bn ) { struct starpu_codelet *codelet = &cl_zlaswp_set; + if ( B->get_rankof( B, Bm, Bn ) != B->myrank ) { /* skip unless this rank owns B(Bm, Bn); the owner is computed with B's own distribution function */ + return; + } //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL; diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c index 6af43659c2768c2443684de411297fab9a68e003..b17f26a486dc87e5d8dcb807369bfa431e809b06 100644 --- a/runtime/starpu/codelets/codelet_zlaswp_batched.c +++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c @@ -57,21 +57,25 @@ CODELETS_CPU( zlaswp_batched, cl_zlaswp_batched_cpu_func ) void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, int m0, int minmn, - int k, - int m, - int n, void *ws, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, - const CHAM_desc_t *Wu, + const CHAM_desc_t *Am, + int Amm, + int Amn, + const CHAM_desc_t *Ak, + int Akm, + int Akn, + const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { int task_num = 0; int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; int nhandles; struct cl_laswp_batched_args_t *clargs = *clargs_ptr; - if ( A->get_rankof( A, m, n) != A->myrank ) { + if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) { return; } @@ -84,7 +88,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, task_num = clargs->tasks_nbr; clargs->m0[ task_num ] = m0; - clargs->handle_mode[ task_num ].handle = RTBLKADDR(A, CHAMELEON_Complex64_t, m, n); + clargs->handle_mode[ task_num ].handle = RTBLKADDR(Am, CHAMELEON_Complex64_t, Amm, Amn); clargs->handle_mode[ task_num ].mode = STARPU_RW; clargs->tasks_nbr ++; @@ -95,8 +99,8 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_laswp_batched_args_t), STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R, RUNTIME_invp_getaddr( ipiv,
ipivk ), - STARPU_RW | STARPU_COMMUTE, RTBLKADDR(Wu, ChamComplexDouble, A->myrank, n), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, k, n), + STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), + STARPU_R, RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, nhandles, STARPU_PRIORITY, options->priority, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -108,12 +112,14 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, } void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, - int k, - int n, const CHAM_ipiv_t *ipiv, int ipivk, - const CHAM_desc_t *A, + const CHAM_desc_t *Ak, + int Akm, + int Akn, const CHAM_desc_t *U, + int Um, + int Un, void **clargs_ptr ) { struct cl_laswp_batched_args_t *clargs = *clargs_ptr; @@ -129,8 +135,8 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_laswp_batched_args_t), STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R, RUNTIME_invp_getaddr( ipiv, ipivk ), - STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, k, n), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, k, n), + STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), + STARPU_R, RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn), STARPU_DATA_MODE_ARRAY, clargs->handle_mode, nhandles, STARPU_PRIORITY, options->priority, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c new file mode 100644 index 0000000000000000000000000000000000000000..ab9cf702294f7a54348b8d7995f45aca5afc32e3 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -0,0 +1,307 @@ +/** + * + * @file starpu/codelet_zperm_allreduce.c + * + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. 
+ * + *** + * + * @brief Chameleon StarPU codelets to do the reduction + * + * @version 1.3.0 + * @author Alycia Lisito + * @date 2024-06-11 + * @precisions normal z -> c d s + * + */ +#include "chameleon_starpu_internal.h" +#include "runtime_codelet_z.h" +#include <coreblas/cblas_wrapper.h> + +#if defined(CHAMELEON_USE_MPI) +struct cl_redux_args_t { /* arguments forwarded to cl_zperm_allreduce_cpu_func */ + int tempmm; /* number of perm entries visited by the kernel loop */ + int n; /* column index used in the owner computation (n % q) */ + int p; /* first grid dimension used in the owner computation (m % p) */ + int q; /* second grid dimension used in the owner computation */ + int p_first; /* rank stored first in the list of involved processes; divided by q in the kernel */ + int me; /* rank of the receiving process; divided by q in the kernel */ + int shift; /* shift of the current reduction step */ + int np_inv; /* number of processes involved in the reduction */ +}; + +static void +cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg ) +{ /* For each of the tempmm entries of perm, copies one strided row of Uin (descr[1]) into Uinout (descr[0]) when the owner index computed for that entry falls in [first, last] or in that window shifted by np */ + struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + const CHAM_tile_t *tileUinout = cti_interface_get( descr[0] ); + const CHAM_tile_t *tileUin = cti_interface_get( descr[1] ); + const int *perm = (int *)STARPU_VECTOR_GET_PTR( descr[2] ); + CHAMELEON_Complex64_t *Uinout = CHAM_tile_get_ptr( tileUinout ); + const CHAMELEON_Complex64_t *Uin = CHAM_tile_get_ptr( tileUin ); + + int tempmm = clargs->tempmm; + int n = clargs->n; + int p = clargs->p; + int q = clargs->q; + int p_first = clargs->p_first / q; + int shift = clargs->shift; + int np = clargs->np_inv; + int me = ( p <= np ) ? clargs->me / q : ( ( clargs->me / q ) - p_first + p ) % p; /* when p > np the index is re-based on p_first, modulo p */ + int nb = tileUinout->n; + int mb = tileUinout->m; + int first = me - 2 * shift + 1; /* lower bound of the accepted owner window */ + int last = me - shift; /* upper bound of the accepted owner window */ + int i, m, ownerp; + + for ( i = 0; i < tempmm; i++ ) { + m = perm[ i ] / mb; /* perm[i] divided by the tile height mb */ + ownerp = ( p <= np ) ?
( (m % p) * q + (n % q) ) / q : ( ( (m % p) * q + (n % q) ) / q - p_first + p ) % p; + + if ( ( (first <= ownerp) && (ownerp <= last ) ) || + ( (first+np <= ownerp) && (ownerp <= last+np) ) ) + { + cblas_zcopy( nb, Uin + i, tileUin->ld, /* nb elements starting at offset i, strided by the tile leading dimension */ + Uinout + i, tileUinout->ld ); + } + } +} + +CODELETS_CPU( zperm_allreduce, cl_zperm_allreduce_cpu_func ) + +static void +INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options, + CHAM_desc_t *U, + int me, + int dst, + int n ) +{ /* Inserts a task with no codelet (NULL) that reads U(me, n) and is set to execute on node dst; presumably this only triggers the transfer of the tile to dst - TODO confirm */ + rt_starpu_insert_task( + NULL, + STARPU_EXECUTE_ON_NODE, dst, + STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, me, n), + STARPU_PRIORITY, options->priority, + 0 ); +} + +static void +INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, + CHAM_desc_t *U, + CHAM_ipiv_t *ipiv, + int ipivk, + int me, + int src, + int n, + int tempmm, + int p, + int q, + int shift, + int np, + int p_first ) +{ /* Inserts the cl_zperm_allreduce task on node me: U(me, n) is accessed in RW mode, U(src, n) and the perm data in R mode */ + struct cl_redux_args_t *clargs; + clargs = malloc( sizeof( struct cl_redux_args_t ) ); /* handed to the task through STARPU_CL_ARGS; not freed in this function */ + clargs->tempmm = tempmm; + clargs->n = n; + clargs->p = p; + clargs->q = q; + clargs->p_first = p_first; + clargs->me = me; + clargs->shift = shift; + clargs->np_inv = np; + + rt_starpu_insert_task( + &cl_zperm_allreduce, + STARPU_CL_ARGS, clargs, sizeof(struct cl_redux_args_t), + STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, me, n), + STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n), + STARPU_R, RUNTIME_perm_getaddr( ipiv, ipivk ), + STARPU_EXECUTE_ON_NODE, me, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_PRIORITY, options->priority, + 0 ); + starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) ); /* flushes the MPI cache entry of the source tile U(src, n) */ +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + int *proc_involved = tmp->proc_involved; + int np_involved = chameleon_min( A->p,
A->mt - k); + int np_iter = np_involved; /* halved (rounded up) at each step; the loop stops when it reaches 1 */ + int p_recv, p_send, me, p_first; + int shift = 1; /* distance to the send/recv partners in proc_involved, doubled at each step */ + + if ( np_involved == 1 ) { + assert( proc_involved[0] == A->myrank ); + } + else { + p_first = proc_involved[0]; + for( me = 0; me < np_involved; me++ ) { /* locate the position of this rank in proc_involved */ + if ( proc_involved[me] == A->myrank ) { + break; + } + } + assert( me < np_involved ); + while ( np_iter > 1 ) { + p_send = proc_involved[ ( me + shift ) % np_involved ]; + p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; + + INSERT_TASK_zperm_allreduce_send( options, U, A->myrank, p_send, n ); + INSERT_TASK_zperm_allreduce_recv( options, U, ipiv, ipivk, A->myrank, p_recv, + n, k == (A->mt-1) ? A->m - k * A->mb : A->mb, + A->p, A->q, shift, np_involved, p_first ); + + shift = shift << 1; + np_iter = chameleon_ceil( np_iter, 2 ); + } + } +} + +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ /* Requests a detached transfer of tile A(Am, An) to each of the np ranks listed in proc_involved, except myrank */ + int p; + + for ( p = 0; p < np; p ++ ) { + if ( proc_involved[ p ] == myrank ) { + continue; + } + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + proc_involved[ p ], NULL, NULL ); + } +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ /* Requests a detached transfer of the perm data of ipiv at index ipivk to each of the np ranks listed in proc_involved, except myrank */ + int p; + + for ( p = 0; p < np; p++ ) { + if ( proc_involved[ p ] == myrank ) { + continue; + } + starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RUNTIME_perm_getaddr( ipiv, ipivk ), + proc_involved[ p ], NULL, NULL ); + } +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ + int b, rank; + + for ( b = k+1; (b < A->mt) && ((b-(k+1)) < A->p); b ++ ) { + rank = A->get_rankof( A, b, n ); + if ( rank == A->myrank ) { + continue; + } +
starpu_mpi_get_data_on_node_detached( options->sequence->comm, + RUNTIME_invp_getaddr( ipiv, ipivk ), + rank, NULL, NULL ); + } +} +#else /* !defined(CHAMELEON_USE_MPI): the functions below are no-op versions */ +void +INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, + CHAM_desc_t *A, + int Am, + int An, + int myrank, + int np, + int *proc_involved ) +{ /* no-op without MPI: the arguments are only marked as used */ + (void)options; + (void)A; + (void)Am; + (void)An; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_perm( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + int myrank, + int np, + int *proc_involved ) +{ /* no-op without MPI: the arguments are only marked as used */ + (void)options; + (void)ipiv; + (void)ipivk; + (void)myrank; + (void)np; + (void)proc_involved; +} + +void +INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int ipivk, + const CHAM_desc_t *A, + int k, + int n ) +{ /* no-op without MPI: the arguments are only marked as used */ + (void)options; + (void)ipiv; + (void)ipivk; + (void)A; + (void)k; + (void)n; +} + +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + CHAM_desc_t *U, + int Um, + int Un, + void *ws ) +{ /* no-op without MPI: the arguments are only marked as used */ + (void)options; + (void)A; + (void)ipiv; + (void)ipivk; + (void)k; + (void)n; + (void)U; + (void)Um; + (void)Un; + (void)ws; +} +#endif /* defined(CHAMELEON_USE_MPI) */