diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 6db9a8a40148a67fcaeda74f9a718c949b12e59d..635bbbb84a3436564d04476f4e366f9ac5edb10b 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -150,7 +150,7 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, } /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } /* Flush temporary data used for the pivoting */ @@ -196,7 +196,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } free( clargs ); @@ -250,7 +250,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) { INSERT_TASK_zgetrf_blocked_trsm( @@ -312,7 +312,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( (b < (nbblock-1)) && (h == hmax-1) ) { INSERT_TASK_zgetrf_blocked_trsm( @@ -338,10 +338,12 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, int k, RUNTIME_option_t *options ) { +#if defined(CHAMELEON_USE_MPI) chameleon_get_proc_involved_in_panelk_2dbc( A, k, k, ws ); if ( !ws->involved ) { return; } +#endif /* TODO: Should be replaced by a function pointer */ switch( ws->alg 
) { @@ -350,7 +352,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPivPerColumn: - if ( ws->batch_size > 0 ) { + if ( ws->batch_size_blas2 > 0 ) { chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options ); } else { @@ -359,7 +361,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPiv: - if ( ws->batch_size > 0 ) { + if ( ws->batch_size_blas2 > 0 ) { chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options ); } else { @@ -392,19 +394,6 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, int tempkm, tempkn, tempnn, minmn; int withlacpy; - chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); - } - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); - } - - if ( !ws->involved ) { - return; - } - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempkn = A->get_blkdim( A, k, DIM_n, A->n ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); @@ -433,8 +422,7 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, ipiv, k, A(k, n), A(m, n) ); } - INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, - Wu(A->myrank, n), ws ); + INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); } break; default: @@ -458,19 +446,6 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, int tempkm, tempkn, tempnn, minmn; int withlacpy; - chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - 
INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); - } - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); - } - - if ( !ws->involved ) { - return; - } - void **clargs = malloc( sizeof(char *) ); *clargs = NULL; @@ -499,7 +474,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); - INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, Wu(A->myrank, n), ws ); + INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); free( clargs ); } @@ -509,6 +484,80 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } } +static inline void +chameleon_pzgetrf_panel_permute_forward( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options ) +{ +#if defined(CHAMELEON_USE_MPI) + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } +#endif + + if ( ws->batch_size_swap > 0 ) { + chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); + } + else { + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + } +} + +static inline void +chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options, + RUNTIME_sequence_t *sequence ) +{ + int tempkm, tempnn; + +#if 
defined(CHAMELEON_USE_MPI) + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } +#endif + + if ( ws->batch_size_swap > 0 ) { + chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); + } + else { + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + } + + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + + tempkm = A->get_blkdim( A, k, DIM_m, A->m ); + tempnn = A->get_blkdim( A, n, DIM_n, A->n ); + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + RUNTIME_data_flush( sequence, A(k, n) ); + } +} + static inline void chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -516,7 +565,7 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, RUNTIME_option_t *options ) { CHAM_context_t *chamctxt = chameleon_context_self(); - int m, tempmm, tempkn, q; + int m, n, tempmm, tempkn, tempkm, p, q, involved, np; int lookahead = chamctxt->lookahead; int P = chameleon_desc_datadist_get_iparam(A, 0); int Q = chameleon_desc_datadist_get_iparam(A, 1); @@ -561,6 +610,44 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, RUNTIME_data_flush( options->sequence, A(m, k) ); } } + + tempkm = A->get_blkdim( A, k, DIM_m, A->m ); + np = chameleon_desc_datadist_get_iparam(A, 1) * chameleon_desc_datadist_get_iparam(A, 0); +#if defined(CHAMELEON_USE_MPI) + /* Send Akk for replicated trsm */ + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + for ( p = 0; p < np; p++ ) { + involved = 0; + for ( n = 
k+1; n < A->nt; n++ ) { + if ( chameleon_p_involved_in_panelk_2dbc( A, n, p ) ) { + involved = 1; + break; + } + } + if ( involved ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(p, k) ); + } + } + } + else { + involved = 0; + for ( n = k+1; n < A->nt; n++ ) { + if ( chameleon_involved_in_panelk_2dbc( A, n ) ) { + involved = 1; + break; + } + } + if ( involved ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(A->myrank, k) ); + } + } +#else + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(A->myrank, k) ); +#endif + RUNTIME_data_flush( options->sequence, A(k, k) ); } static inline void @@ -584,30 +671,17 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - if ( ws->batch_size > 0 ) { - chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); - } - else { - chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); - } + chameleon_pzgetrf_panel_permute_forward( ws, A, ipiv, k, n, options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - for ( p = 0; p < ws->np_involved; p++ ) { - INSERT_TASK_ztrsm( - options, - ChamLeft, ChamLower, ChamNoTrans, ChamUnit, - tempkm, tempnn, A->mb, - zone, A(k, k), - Wu(ws->proc_involved[p], n) ); - RUNTIME_data_flush( options->sequence, Wu(ws->proc_involved[p], n) ); - } - } - else if ( ws->involved ) { +#if defined(CHAMELEON_USE_MPI) + if ( ws->involved ) +#endif + { INSERT_TASK_ztrsm( options, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, tempkm, tempnn, A->mb, - zone, A(k, k), + zone, Wu(A->myrank, k), Wu(A->myrank, n) ); } @@ -632,7 +706,6 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, } RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) ); - RUNTIME_data_flush( options->sequence, A(k, k) ); RUNTIME_data_flush( options->sequence, A(k, n) ); } @@ -683,54 +756,26 @@ void 
chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } /* Flush panel k */ - for (m = k; m < A->mt; m++) { + for (m = k+1; m < A->mt; m++) { RUNTIME_data_flush( sequence, A(m, k) ); } + RUNTIME_data_flush( sequence, Wu(A->myrank, k) ); RUNTIME_iteration_pop( chamctxt ); } CHAMELEON_Desc_Flush( &(ws->Wl), sequence ); /* Backward pivoting */ - if ( ws->batch_size > 0 ) { - for (k = 1; k < min_mnt; k++) { - for (n = 0; n < k; n++) { - if ( chameleon_involved_in_panelk_2dbc( A, k ) || - chameleon_involved_in_panelk_2dbc( A, n ) ) - { - chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); - tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, - Wu(A->myrank, n), A(k, n) ); - RUNTIME_data_flush( sequence, A(k, n) ); - } - } - RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); - } - RUNTIME_perm_flushk( sequence, IPIV, k ); - } - } - else { - for (k = 1; k < min_mnt; k++) { - for (n = 0; n < k; n++) { - if ( chameleon_involved_in_panelk_2dbc( A, k ) || - chameleon_involved_in_panelk_2dbc( A, n ) ) - { - chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); - tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, - Wu(A->myrank, n), A(k, n) ); - RUNTIME_data_flush( sequence, A(k, n) ); - } - } - RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); + for (k = 1; k < min_mnt; k++) { + for (n = 0; n < k; n++) { + if ( chameleon_involved_in_panelk_2dbc( A, k ) || + chameleon_involved_in_panelk_2dbc( A, n ) ) + { + chameleon_pzgetrf_panel_permute_backward( ws, A, IPIV, k, n, &options, sequence ); } - RUNTIME_perm_flushk( sequence, IPIV, k ); + RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); } + RUNTIME_perm_flushk( sequence, IPIV, k ); } 
CHAMELEON_Desc_Flush( &(ws->Wu), sequence ); diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 514e89d3e375a38d487efcddf6aee07505660f00..254020a55c478dcb6982d5b002fff2d5e69c9902 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -98,10 +98,35 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( algostr ); } - ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); - if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); - ws->batch_size = CHAMELEON_BATCH_SIZE; + { + char *allreduce = chameleon_getenv( "CHAMELEON_GETRF_ALL_REDUCE" ); + + if ( allreduce != NULL ) { + if ( strcasecmp( allreduce, "cham_spu_tasks" ) == 0 ) { + ws->alg_allreduce = ChamStarPUTasks; + } + else { + chameleon_error( "CHAMELEON_zgetrf_WS_Alloc", "CHAMELEON_GETRF_ALL_REDUCE is not one of chameleon_starpu_tasks, chameleon_starpu, chameleon_starpu_mpi, chameleon_mpi => Switch back to chameleon_starpu_tasks\n" ); + ws->alg_allreduce = ChamStarPUTasks; + } + } + chameleon_cleanenv( allreduce ); + } + + ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", 0 ); + if ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS2 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS2 value\n" ); + ws->batch_size_blas2 = CHAMELEON_BATCH_SIZE; + } + ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", 0 ); + if ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS3 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the 
right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS3 value\n" ); + ws->batch_size_blas3 = CHAMELEON_BATCH_SIZE; + } + ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", 0 ); + if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_SWAP must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_SWAP value\n" ); + ws->batch_size_swap = CHAMELEON_BATCH_SIZE; } ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); diff --git a/control/compute_z.h b/control/compute_z.h index b75c303a1131e5c9f142803e66a227fe0c04de91..1229a1797915be3d358cf018f3b779bca8aeefe2 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -43,17 +43,20 @@ struct chameleon_pzgemm_s { * @brief Data structure to handle the GETRF workspaces with partial pivoting */ struct chameleon_pzgetrf_s { - cham_getrf_t alg; - int ib; /**< Internal blocking parameter */ - int batch_size; /**< Batch size for the panel */ - int ringswitch; /**< Define when to switch to ring bcast */ - CHAM_desc_t U; - CHAM_desc_t Up; /**< Workspace used for the panel factorization */ - CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ - CHAM_desc_t Wl; /**< Workspace used the update */ - int *proc_involved; - unsigned int involved; - int np_involved; + cham_getrf_t alg; + cham_getrf_allreduce_t alg_allreduce; + int ib; /**< Internal blocking parameter */ + int batch_size_blas2; /**< Batch size for the blas 2 operations of the panel factorization */ + int batch_size_blas3; /**< Batch size for the blas 3 operations of the panel factorization */ + int batch_size_swap; /**< Batch size for the permutation */ + int ringswitch; /**< Define when to switch to ring bcast */ + CHAM_desc_t U; + CHAM_desc_t Up; /**< Workspace used for the panel factorization */ + CHAM_desc_t Wu; 
/**< Workspace used for the permutation and update */ + CHAM_desc_t Wl; /**< Workspace used for the update */ + int *proc_involved; + unsigned int involved; + int np_involved; }; /** diff --git a/control/descriptor_helpers.c b/control/descriptor_helpers.c index d5e1430638e8b507e4ac6163869d87b3021d97f4..6a0492111203815999797bc0d40243af1d6391d3 100644 --- a/control/descriptor_helpers.c +++ b/control/descriptor_helpers.c @@ -100,6 +100,26 @@ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k ) { return ( myrank % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) ); } +/** + * @brief Test if the MPI process p is involved in the panel k for 2DBC distributions. + * + * @param[in] A + * The matrix descriptor. + * + * @param[in] k + * The index of the panel to test. + * + * @param[in] p + * The rank of the MPI process. + * + * @return 1 if the MPI process p contributes to the panel k. + * 0 if the MPI process p doesn't contribute to the panel k. + * + */ +int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p ) { + return ( p % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) ); +} + /** * @brief Test if the current MPI process is involved in the panel k for 2DBC distributions. 
* diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index 09bfc942f5fc1d2e829616b61b79b878324bf61c..88c1f653b66eaca7aa597e0e34de9b0b9ad416b2 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -290,6 +290,13 @@ typedef enum chameleon_getrf_e { ChamGetrfPPivPerColumn = 3, } cham_getrf_t; +/** + * @brief Chameleon GETRF all reduce algorithm variants + */ +typedef enum chameleon_getrf_allreduce_e { + ChamStarPUTasks, +} cham_getrf_allreduce_t; + #define ChameleonTrd 1001 #define ChameleonBrd 1002 diff --git a/include/chameleon/descriptor_helpers.h b/include/chameleon/descriptor_helpers.h index 9e60ef27dda1d76e3af3c8f5fe3a2b427ae719c5..f8caf508060d4a2f5aaaf7a1ef12ce43d31505de 100644 --- a/include/chameleon/descriptor_helpers.h +++ b/include/chameleon/descriptor_helpers.h @@ -64,6 +64,7 @@ int chameleon_getrankof_custom ( const CHAM_desc_t *A, int m, int n ); */ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int An ); +int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p ); void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int n, diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 5f1bbcd322293e3104fca313974096bcf711de71..bf3831af524b2cbea33377a4ff8ab4ce1e124bb6 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -575,13 +575,13 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ); + int n, + void *ws ); /** ******************************************************************************** @@ -600,6 +600,16 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, * @param[in] A * The descriptor of the 
matrix A. * + * @param[inout] U + * The descriptor of the workspace used for the permutation in the LU + * factorization with partial pivoting. + * + * @param[in] Um + * The row index of the tile used in U. + * + * @param[in] Un + * The column index of the tile used in U. + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting. @@ -613,16 +623,6 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, * @param[in] n * The number of columns in the tile U(Um, Un). * - * @param[inout] U - * The descriptor of the worskpace used for the permutation in the LU - * factorization with partial pivoting. - * - * @param[in] Um - * The row index of the tile used in U. - * - * @param[in] Un - * The column index of the tile used in U. - * * @param[in] ws * The workspace to handle the data in the LU factorization with * partial pivoting. @@ -631,13 +631,13 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, */ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ); /** diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c index b088283254cd64e1bada1628939436327b8a2789..197842ea3e96fdba1a9e1d67152a8a5b3e6196ea 100644 --- a/runtime/openmp/codelets/codelet_zipiv_allreduce.c +++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_openmp.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git 
a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c index cb77c806bcb8ce47a62e7b4e19b2dad3dafc8218..7aeb24faebda059ad96dec2819b8793d467eae05 100644 --- a/runtime/openmp/codelets/codelet_zperm_allreduce.c +++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c index 75e0611647a464cad9c37e59a5619ebefaae19ed..d6bd3f4c06baf9b1c44e4db6971c88c09acd432f 100644 --- a/runtime/parsec/codelets/codelet_zipiv_allreduce.c +++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_parsec.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c index 30890f8114b857b7c12804c526f4aa4c875b63a1..5acfa4a2b099785e7397807309d104d5421c34fb 100644 --- a/runtime/parsec/codelets/codelet_zperm_allreduce.c +++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t 
*options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c index e88269e931f3f210282a1382d44a6ff9516c7453..0186fd142b67d08dcfca01e9b8184b471362ce1c 100644 --- a/runtime/quark/codelets/codelet_zipiv_allreduce.c +++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_quark.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c index 52281451dd038a9276a2040b9f4c08f7effa63f7..f6c5f98e6d59ed67db6ae9ca7dbe37abca31d617 100644 --- a/runtime/quark/codelets/codelet_zperm_allreduce.c +++ b/runtime/quark/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 
2e04493df242f90fb18499abfab703724e90d197..011785aa2f459629adaabe457e77a908786ac14d 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -74,7 +74,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv ) { int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2; void (*callback)(void*) = NULL; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; int rankA = A->get_rankof( A, Am, An ); @@ -241,8 +241,9 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; - int ib = ((struct chameleon_pzgetrf_s *)ws)->ib; + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws; + int ib = tmp->ib; + int batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3; int task_num = 0; void (*callback)(void*) = NULL; int accessU, access_npiv, access_ipiv, access_ppiv; diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c index a81f0d08eef1fb94b6846606b5e63aae64ab075c..48ecdd0c33fa07f9cfd326f775b0f31fb48a67b8 100644 --- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c +++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c @@ -22,18 +22,18 @@ struct cl_redux_args_t { int h; int n; - int k; }; -static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +static void +zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me, + cppi_interface_t *cppi_src, + int h, + int n ) { - struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; - cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); - cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); - CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); - CHAM_pivot_t *nextpiv_src = 
&(cppi_src->pivot); - CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); - CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); + CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); + CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); + CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); + CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); cppi_display_dbg( cppi_me, stderr, "Global redux Inout: "); cppi_display_dbg( cppi_src, stderr, "Global redux Input: "); @@ -43,33 +43,42 @@ static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) assert( cppi_me->flttype == cppi_src->flttype ); assert( cppi_me->arraysize == cppi_src->arraysize ); - if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) { + if ( cabs( pivrow_src[ h ] ) > cabs( pivrow_me[ h ] ) ) { nextpiv_me->blkm0 = nextpiv_src->blkm0; nextpiv_me->blkidx = nextpiv_src->blkidx; - cblas_zcopy( clargs->n, pivrow_src, 1, pivrow_me, 1 ); + cblas_zcopy( n, pivrow_src, 1, pivrow_me, 1 ); } /* Let's copy the diagonal row if needed */ if ( ( cppi_src->has_diag == 1 ) && ( cppi_me->has_diag == -1 ) ) { - cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); - assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * clargs->n ); + cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); + assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n ); cppi_me->has_diag = 1; } cppi_display_dbg( cppi_me, stderr, "Global redux Inout(After): "); } +static void +cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); + cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); + zipiv_allreduce_cpu_func( cppi_me, cppi_src, clargs->h, clargs->n ); +} + CODELETS_CPU( zipiv_allreduce, 
cl_zipiv_allreduce_cpu_func ) -void -INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, - int me, - int dst, - int k, - int h, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int dst, + int k, + int h ) { rt_starpu_insert_task( NULL, @@ -79,20 +88,19 @@ INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, 0 ); } -void -INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, - int me, - int src, - int k, - int h, - int n, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int src, + int k, + int h, + int n ) { struct cl_redux_args_t *clargs; - clargs = malloc( sizeof( struct cl_redux_args_t ) ); + clargs = malloc( sizeof( struct cl_redux_args_t ) ); clargs->h = h; clargs->n = n; - clargs->k = k; rt_starpu_insert_task( &cl_zipiv_allreduce, @@ -106,16 +114,17 @@ INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); } -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +static void +zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) { - int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); - int np_iter = np_involved; + int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); + int np_iter = np_involved; int p_recv, p_send, me; int shift = 1; @@ -140,29 +149,48 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, p_send = proc_involved[ ( me + shift ) % np_involved ]; p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; - INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h, options 
); - INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options ); + INSERT_TASK_zipiv_allreduce_send( options, ipiv, A->myrank, p_send, k, h ); + INSERT_TASK_zipiv_allreduce_recv( options, ipiv, A->myrank, p_recv, k, h, n ); shift = shift << 1; np_iter = chameleon_ceil( np_iter, 2 ); } } } + +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, + int n, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + cham_getrf_allreduce_t alg = tmp->alg_allreduce; + switch( alg ) { + case ChamStarPUTasks: + default: + zipiv_allreduce_chameleon_starpu_task( options, A, ipiv, tmp->proc_involved, k, h, n ); + } +} #else -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, + int n, + void *ws ) { if ( h > 0 ) { starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) ); } (void)options; - (void)proc_involved; + (void)ws; (void)n; } #endif diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c index b17f26a486dc87e5d8dcb807369bfa431e809b06..303e6a674b564a9fbe3833931a5190af9e8ed136 100644 --- a/runtime/starpu/codelets/codelet_zlaswp_batched.c +++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c @@ -72,7 +72,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, void **clargs_ptr ) { int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap; int nhandles; struct cl_laswp_batched_args_t *clargs = *clargs_ptr; if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) { diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c 
b/runtime/starpu/codelets/codelet_zperm_allreduce.c index 4c33a2e5086199af65219e86189f47aba18c7755..1c8d44164e9a97dec5427f2a9d775cb7f28b9315 100644 --- a/runtime/starpu/codelets/codelet_zperm_allreduce.c +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -102,14 +102,14 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, { struct cl_redux_args_t *clargs; clargs = malloc( sizeof( struct cl_redux_args_t ) ); - clargs->tempmm = tempmm; - clargs->n = n; - clargs->p = p; - clargs->q = q; - clargs->p_first = p_first; - clargs->me = me; - clargs->shift = shift; - clargs->np_inv = np; + clargs->tempmm = tempmm; + clargs->n = n; + clargs->p = p; + clargs->q = q; + clargs->p_first = p_first; + clargs->me = me; + clargs->shift = shift; + clargs->np_inv = np; rt_starpu_insert_task( &cl_zperm_allreduce, @@ -124,20 +124,19 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) ); } -void -INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, - const CHAM_desc_t *A, - CHAM_ipiv_t *ipiv, - int ipivk, - int k, - int n, - CHAM_desc_t *U, - int Um, - int Un, - void *ws ) +static void +zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + struct chameleon_pzgetrf_s *ws) { - struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; - int *proc_involved = tmp->proc_involved; + int *proc_involved = ws->proc_involved; int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); int np_iter = np_involved; int p_recv, p_send, me, p_first; @@ -169,6 +168,27 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, } } +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, + CHAM_ipiv_t *ipiv, + int 
ipivk, + int k, + int n, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + cham_getrf_allreduce_t alg = tmp->alg_allreduce; + switch( alg ) { + case ChamStarPUTasks: + default: + zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); + } +} + void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, CHAM_desc_t *A, @@ -284,24 +304,24 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } #endif