From 410358afefe23d4341569522592bbe5640777dd5 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Thu, 8 Aug 2024 16:59:08 +0200 Subject: [PATCH 1/6] zgetrf: Add all_reduce algorithm choice --- compute/zgetrf.c | 15 +++++++++++++++ control/compute_z.h | 23 ++++++++++++----------- include/chameleon/constants.h | 7 +++++++ 3 files changed, 34 insertions(+), 11 deletions(-) diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 514e89d3e..976ba2ad5 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -98,6 +98,21 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( algostr ); } + { + char *allreduce = chameleon_getenv( "CHAMELEON_GETRF_ALL_REDUCE" ); + + if ( allreduce != NULL ) { + if ( strcasecmp( allreduce, "cham_spu_tasks" ) == 0 ) { + ws->alg_allreduce = ChamStarPUTasks; + } + else { + chameleon_error( "CHAMELEON_zgetrf_WS_Alloc", "CHAMELEON_GETRF_ALL_REDUCE is not one of chameleon_starpu_tasks, chameleon_starpu, chameleon_starpu_mpi, chameleon_mpi => Switch back to chameleon_starpu_tasks\n" ); + ws->alg_allreduce = ChamStarPUTasks; + } + } + chameleon_cleanenv( allreduce ); + } + ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) { chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); diff --git a/control/compute_z.h b/control/compute_z.h index b75c303a1..855f820b1 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -43,17 +43,18 @@ struct chameleon_pzgemm_s { * @brief Data structure to handle the GETRF workspaces with partial pivoting */ struct chameleon_pzgetrf_s { - cham_getrf_t alg; - int ib; /**< Internal blocking parameter */ - int batch_size; /**< Batch size for the panel */ - int ringswitch; /**< Define when to switch to ring bcast */ - CHAM_desc_t U; - 
CHAM_desc_t Up; /**< Workspace used for the panel factorization */ - CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ - CHAM_desc_t Wl; /**< Workspace used the update */ - int *proc_involved; - unsigned int involved; - int np_involved; + cham_getrf_t alg; + cham_getrf_allreduce_t alg_allreduce; + int ib; /**< Internal blocking parameter */ + int batch_size; /**< Batch size for the panel */ + int ringswitch; /**< Define when to switch to ring bcast */ + CHAM_desc_t U; + CHAM_desc_t Up; /**< Workspace used for the panel factorization */ + CHAM_desc_t Wu; /**< Workspace used for the permutation and update */ + CHAM_desc_t Wl; /**< Workspace used for the update */ + int *proc_involved; + unsigned int involved; + int np_involved; }; /** diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index 09bfc942f..88c1f653b 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -290,6 +290,13 @@ typedef enum chameleon_getrf_e { ChamGetrfPPivPerColumn = 3, } cham_getrf_t; +/** + * @brief Chameleon GETRF all reduce algorithm variants + */ +typedef enum chameleon_getrf_allreduce_e { + ChamStarPUTasks, +} cham_getrf_allreduce_t; + #define ChameleonTrd 1001 #define ChameleonBrd 1002 -- GitLab From 6ee85f43a4a7a0711f9873d8c02e71e040863d53 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Fri, 25 Oct 2024 11:32:49 +0200 Subject: [PATCH 2/6] zgetrf: zipiv allreduce MPI in task prepare codelet --- compute/pzgetrf.c | 8 +- include/chameleon/tasks_z.h | 8 +- .../openmp/codelets/codelet_zipiv_allreduce.c | 12 +- .../parsec/codelets/codelet_zipiv_allreduce.c | 12 +- .../quark/codelets/codelet_zipiv_allreduce.c | 12 +- .../starpu/codelets/codelet_zipiv_allreduce.c | 126 +++++++++++------- 6 files changed, 103 insertions(+), 75 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 6db9a8a40..4347f5710 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -150,7 +150,7 @@
chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, } /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } /* Flush temporary data used for the pivoting */ @@ -196,7 +196,7 @@ chameleon_pzgetrf_panel_facto_percol_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, k, clargs, ipiv ); - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, h, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, h, tempkn, ws ); } free( clargs ); @@ -250,7 +250,7 @@ chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( ( b < (nbblock-1) ) && ( h == hmax-1 ) ) { INSERT_TASK_zgetrf_blocked_trsm( @@ -312,7 +312,7 @@ chameleon_pzgetrf_panel_facto_blocked_batched( struct chameleon_pzgetrf_s *ws, assert( j <= minmn ); /* Reduce globally (between MPI processes) */ - INSERT_TASK_zipiv_allreduce( A, options, ipiv, ws->proc_involved, k, j, tempkn ); + INSERT_TASK_zipiv_allreduce( options, A, ipiv, k, j, tempkn, ws ); if ( (b < (nbblock-1)) && (h == hmax-1) ) { INSERT_TASK_zgetrf_blocked_trsm( diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 5f1bbcd32..402c92a3f 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -575,13 +575,13 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options, CHAM_desc_t *U, int Um, int Un, CHAM_ipiv_t *ws ); -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, 
int h, - int n ); + int n, + void *ws ); /** ******************************************************************************** diff --git a/runtime/openmp/codelets/codelet_zipiv_allreduce.c b/runtime/openmp/codelets/codelet_zipiv_allreduce.c index b08828325..197842ea3 100644 --- a/runtime/openmp/codelets/codelet_zipiv_allreduce.c +++ b/runtime/openmp/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_openmp.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/parsec/codelets/codelet_zipiv_allreduce.c b/runtime/parsec/codelets/codelet_zipiv_allreduce.c index 75e061164..d6bd3f4c0 100644 --- a/runtime/parsec/codelets/codelet_zipiv_allreduce.c +++ b/runtime/parsec/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_parsec.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t *ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/quark/codelets/codelet_zipiv_allreduce.c b/runtime/quark/codelets/codelet_zipiv_allreduce.c index e88269e93..0186fd142 100644 --- a/runtime/quark/codelets/codelet_zipiv_allreduce.c +++ b/runtime/quark/codelets/codelet_zipiv_allreduce.c @@ -17,19 +17,19 @@ */ #include "chameleon_quark.h" -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, +void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, CHAM_ipiv_t 
*ipiv, - int *proc_involved, int k, int h, - int n ) + int n, + void *ws ) { - (void)A; (void)options; + (void)A; (void)ipiv; - (void)proc_involved; (void)k; (void)h; (void)n; + (void)ws; } diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c index a81f0d08e..48ecdd0c3 100644 --- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c +++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c @@ -22,18 +22,18 @@ struct cl_redux_args_t { int h; int n; - int k; }; -static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +static void +zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me, + cppi_interface_t *cppi_src, + int h, + int n ) { - struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; - cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); - cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); - CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); - CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); - CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); - CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); + CHAM_pivot_t *nextpiv_me = &(cppi_me->pivot); + CHAM_pivot_t *nextpiv_src = &(cppi_src->pivot); + CHAMELEON_Complex64_t *pivrow_me = (CHAMELEON_Complex64_t *)(nextpiv_me->pivrow); + CHAMELEON_Complex64_t *pivrow_src = (CHAMELEON_Complex64_t *)(nextpiv_src->pivrow); cppi_display_dbg( cppi_me, stderr, "Global redux Inout: "); cppi_display_dbg( cppi_src, stderr, "Global redux Input: "); @@ -43,33 +43,42 @@ static void cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) assert( cppi_me->flttype == cppi_src->flttype ); assert( cppi_me->arraysize == cppi_src->arraysize ); - if ( cabs( pivrow_src[ clargs->h ] ) > cabs( pivrow_me[ clargs->h ] ) ) { + if ( cabs( pivrow_src[ h ] ) > cabs( pivrow_me[ h ] ) ) { nextpiv_me->blkm0 = nextpiv_src->blkm0; nextpiv_me->blkidx = nextpiv_src->blkidx; - cblas_zcopy( 
clargs->n, pivrow_src, 1, pivrow_me, 1 ); + cblas_zcopy( n, pivrow_src, 1, pivrow_me, 1 ); } /* Let's copy the diagonal row if needed */ if ( ( cppi_src->has_diag == 1 ) && ( cppi_me->has_diag == -1 ) ) { - cblas_zcopy( clargs->n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); - assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * clargs->n ); + cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 ); + assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n ); cppi_me->has_diag = 1; } cppi_display_dbg( cppi_me, stderr, "Global redux Inout(After): "); } +static void +cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_redux_args_t *clargs = (struct cl_redux_args_t *) cl_arg; + cppi_interface_t *cppi_me = ((cppi_interface_t *) descr[0]); + cppi_interface_t *cppi_src = ((cppi_interface_t *) descr[1]); + zipiv_allreduce_cpu_func( cppi_me, cppi_src, clargs->h, clargs->n ); +} + CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func ) -void -INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, - int me, - int dst, - int k, - int h, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int dst, + int k, + int h ) { rt_starpu_insert_task( NULL, @@ -79,20 +88,19 @@ INSERT_TASK_zipiv_allreduce_send( CHAM_ipiv_t *ipiv, 0 ); } -void -INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, - int me, - int src, - int k, - int h, - int n, - const RUNTIME_option_t *options ) +static void +INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options, + CHAM_ipiv_t *ipiv, + int me, + int src, + int k, + int h, + int n ) { struct cl_redux_args_t *clargs; - clargs = malloc( sizeof( struct cl_redux_args_t ) ); + clargs = malloc( sizeof( struct cl_redux_args_t ) ); clargs->h = h; clargs->n = n; - clargs->k = k; rt_starpu_insert_task( &cl_zipiv_allreduce, @@ -106,16 +114,17 @@ INSERT_TASK_zipiv_allreduce_recv( CHAM_ipiv_t *ipiv, 
starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) ); } -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +static void +zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int *proc_involved, + int k, + int h, + int n ) { - int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); - int np_iter = np_involved; + int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); + int np_iter = np_involved; int p_recv, p_send, me; int shift = 1; @@ -140,29 +149,48 @@ void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, p_send = proc_involved[ ( me + shift ) % np_involved ]; p_recv = proc_involved[ ( me - shift + np_involved ) % np_involved ]; - INSERT_TASK_zipiv_allreduce_send( ipiv, A->myrank, p_send, k, h, options ); - INSERT_TASK_zipiv_allreduce_recv( ipiv, A->myrank, p_recv, k, h, n, options ); + INSERT_TASK_zipiv_allreduce_send( options, ipiv, A->myrank, p_send, k, h ); + INSERT_TASK_zipiv_allreduce_recv( options, ipiv, A->myrank, p_recv, k, h, n ); shift = shift << 1; np_iter = chameleon_ceil( np_iter, 2 ); } } } + +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, + int n, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + cham_getrf_allreduce_t alg = tmp->alg_allreduce; + switch( alg ) { + case ChamStarPUTasks: + default: + zipiv_allreduce_chameleon_starpu_task( options, A, ipiv, tmp->proc_involved, k, h, n ); + } +} #else -void INSERT_TASK_zipiv_allreduce( CHAM_desc_t *A, - const RUNTIME_option_t *options, - CHAM_ipiv_t *ipiv, - int *proc_involved, - int k, - int h, - int n ) +void +INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int h, 
+ int n, + void *ws ) { if ( h > 0 ) { starpu_data_invalidate_submit( RUNTIME_pivot_getaddr( ipiv, A->myrank, k, h-1 ) ); } (void)options; - (void)proc_involved; + (void)ws; (void)n; } #endif -- GitLab From 488866d4968e6d2b0bf44eefe5a8004684f27638 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Mon, 21 Oct 2024 14:03:18 +0200 Subject: [PATCH 3/6] zgetrf: zperm allreduce MPI in task prepare codelet --- compute/pzgetrf.c | 5 +- include/chameleon/tasks_z.h | 26 +++---- .../openmp/codelets/codelet_zperm_allreduce.c | 12 +-- .../parsec/codelets/codelet_zperm_allreduce.c | 12 +-- .../quark/codelets/codelet_zperm_allreduce.c | 12 +-- .../starpu/codelets/codelet_zperm_allreduce.c | 74 ++++++++++++------- 6 files changed, 80 insertions(+), 61 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 4347f5710..ca7f4d120 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -433,8 +433,7 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, ipiv, k, A(k, n), A(m, n) ); } - INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, - Wu(A->myrank, n), ws ); + INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); } break; default: @@ -499,7 +498,7 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } INSERT_TASK_zlaswp_batched_flush( options, ipiv, k, A(k, n), Wu(A->myrank, n), clargs ); - INSERT_TASK_zperm_allreduce( options, A, ipiv, k, k, n, Wu(A->myrank, n), ws ); + INSERT_TASK_zperm_allreduce( options, A, Wu(A->myrank, n), ipiv, k, k, n, ws ); free( clargs ); } diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 402c92a3f..bf3831af5 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -600,6 +600,16 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, * @param[in] A * The descriptor of the matrix A. 
* + * @param[inout] U + * The descriptor of the workspace used for the permutation in the LU + * factorization with partial pivoting. + * + * @param[in] Um + * The row index of the tile used in U. + * + * @param[in] Un + * The column index of the tile used in U. + * * @param[in] ipiv * The pivot structure that contains the informations for the LU * factorization with partial pivoting. @@ -613,16 +623,6 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, * @param[in] n * The number of columns in the tile U(Um, Un). * - * @param[inout] U - * The descriptor of the worskpace used for the permutation in the LU - * factorization with partial pivoting. - * - * @param[in] Um - * The row index of the tile used in U. - * - * @param[in] Un - * The column index of the tile used in U. - * * @param[in] ws * The workspace to handle the data in the LU factorization with * partial pivoting. @@ -631,13 +631,13 @@ void INSERT_TASK_zipiv_allreduce( const RUNTIME_option_t *options, */ void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ); /** diff --git a/runtime/openmp/codelets/codelet_zperm_allreduce.c b/runtime/openmp/codelets/codelet_zperm_allreduce.c index cb77c806b..7aeb24fae 100644 --- a/runtime/openmp/codelets/codelet_zperm_allreduce.c +++ b/runtime/openmp/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git
a/runtime/parsec/codelets/codelet_zperm_allreduce.c b/runtime/parsec/codelets/codelet_zperm_allreduce.c index 30890f811..5acfa4a2b 100644 --- a/runtime/parsec/codelets/codelet_zperm_allreduce.c +++ b/runtime/parsec/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git a/runtime/quark/codelets/codelet_zperm_allreduce.c b/runtime/quark/codelets/codelet_zperm_allreduce.c index 52281451d..f6c5f98e6 100644 --- a/runtime/quark/codelets/codelet_zperm_allreduce.c +++ b/runtime/quark/codelets/codelet_zperm_allreduce.c @@ -71,23 +71,23 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c index 4c33a2e50..1c8d44164 100644 --- a/runtime/starpu/codelets/codelet_zperm_allreduce.c +++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c @@ -102,14 +102,14 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, { struct cl_redux_args_t *clargs; clargs = malloc( sizeof( struct cl_redux_args_t ) ); - clargs->tempmm = tempmm; - clargs->n = n; - clargs->p = p; - clargs->q = q; - clargs->p_first = p_first; - clargs->me = me; - 
clargs->shift = shift; - clargs->np_inv = np; + clargs->tempmm = tempmm; + clargs->n = n; + clargs->p = p; + clargs->q = q; + clargs->p_first = p_first; + clargs->me = me; + clargs->shift = shift; + clargs->np_inv = np; rt_starpu_insert_task( &cl_zperm_allreduce, @@ -124,20 +124,19 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options, starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) ); } -void -INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, - const CHAM_desc_t *A, - CHAM_ipiv_t *ipiv, - int ipivk, - int k, - int n, - CHAM_desc_t *U, - int Um, - int Un, - void *ws ) +static void +zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + struct chameleon_pzgetrf_s *ws) { - struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; - int *proc_involved = tmp->proc_involved; + int *proc_involved = ws->proc_involved; int np_involved = chameleon_min( chameleon_desc_datadist_get_iparam(A, 0), A->mt - k); int np_iter = np_involved; int p_recv, p_send, me, p_first; @@ -169,6 +168,27 @@ INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, } } +void +INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, + const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, + CHAM_ipiv_t *ipiv, + int ipivk, + int k, + int n, + void *ws ) +{ + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *)ws; + cham_getrf_allreduce_t alg = tmp->alg_allreduce; + switch( alg ) { + case ChamStarPUTasks: + default: + zperm_allreduce_chameleon_starpu_task( options, A, U, Um, Un, ipiv, ipivk, k, n, tmp ); + } +} + void INSERT_TASK_zperm_allreduce_send_A( const RUNTIME_option_t *options, CHAM_desc_t *A, @@ -284,24 +304,24 @@ INSERT_TASK_zperm_allreduce_send_invp( const RUNTIME_option_t *options, void INSERT_TASK_zperm_allreduce( const RUNTIME_option_t *options, 
const CHAM_desc_t *A, + CHAM_desc_t *U, + int Um, + int Un, CHAM_ipiv_t *ipiv, int ipivk, int k, int n, - CHAM_desc_t *U, - int Um, - int Un, void *ws ) { (void)options; (void)A; + (void)U; + (void)Um; + (void)Un; (void)ipiv; (void)ipivk; (void)k; (void)n; - (void)U; - (void)Um; - (void)Un; (void)ws; } #endif -- GitLab From 77a4aaebbb4f39d9af9c20c5448f8b1827eef242 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Wed, 11 Dec 2024 13:13:37 +0100 Subject: [PATCH 4/6] zgetrf: different values for batch size (blas 2 operations, blas 3 operations and swap) --- compute/pzgetrf.c | 6 +++--- compute/zgetrf.c | 18 ++++++++++++++---- control/compute_z.h | 4 +++- .../starpu/codelets/codelet_zgetrf_batched.c | 7 ++++--- .../starpu/codelets/codelet_zlaswp_batched.c | 2 +- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index ca7f4d120..98f9d0470 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -350,7 +350,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPivPerColumn: - if ( ws->batch_size > 0 ) { + if ( ws->batch_size_blas2 > 0 ) { chameleon_pzgetrf_panel_facto_percol_batched( ws, A, ipiv, k, options ); } else { @@ -359,7 +359,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPiv: - if ( ws->batch_size > 0 ) { + if ( ws->batch_size_blas2 > 0 ) { chameleon_pzgetrf_panel_facto_blocked_batched( ws, A, ipiv, k, options ); } else { @@ -583,7 +583,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - if ( ws->batch_size > 0 ) { + if ( ws->batch_size_swap > 0 ) { chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); } else { diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 976ba2ad5..254020a55 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -113,10 +113,20 @@ 
CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( allreduce ); } - ws->batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 ); - if ( ws->batch_size > CHAMELEON_BATCH_SIZE ) { - chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" ); - ws->batch_size = CHAMELEON_BATCH_SIZE; + ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", 0 ); + if ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS2 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS2 value\n" ); + ws->batch_size_blas2 = CHAMELEON_BATCH_SIZE; + } + ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", 0 ); + if ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS3 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS3 value\n" ); + ws->batch_size_blas3 = CHAMELEON_BATCH_SIZE; + } + ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", 0 ); + if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) { + chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_SWAP must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_SWAP value\n" ); + ws->batch_size_swap = CHAMELEON_BATCH_SIZE; } ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX ); diff --git a/control/compute_z.h b/control/compute_z.h index 855f820b1..1229a1797 100644 --- a/control/compute_z.h 
+++ b/control/compute_z.h @@ -46,7 +46,9 @@ struct chameleon_pzgetrf_s { cham_getrf_t alg; cham_getrf_allreduce_t alg_allreduce; int ib; /**< Internal blocking parameter */ - int batch_size; /**< Batch size for the panel */ + int batch_size_blas2; /**< Batch size for the blas 2 operations of the panel factorization */ + int batch_size_blas3; /**< Batch size for the blas 3 operations of the panel factorization */ + int batch_size_swap; /**< Batch size for the permutation */ int ringswitch; /**< Define when to switch to ring bcast */ CHAM_desc_t U; CHAM_desc_t Up; /**< Workspace used for the panel factorization */ diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c index 2e04493df..011785aa2 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_batched.c +++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c @@ -74,7 +74,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options, CHAM_ipiv_t *ipiv ) { int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2; void (*callback)(void*) = NULL; struct cl_getrf_batched_args_t *clargs = *clargs_ptr; int rankA = A->get_rankof( A, Am, An ); @@ -241,8 +241,9 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options, void **clargs_ptr, CHAM_ipiv_t *ipiv ) { - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; - int ib = ((struct chameleon_pzgetrf_s *)ws)->ib; + struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws; + int ib = tmp->ib; + int batch_size = ( (h % ib) != 0 ) ? 
tmp->batch_size_blas2 : tmp->batch_size_blas3; int task_num = 0; void (*callback)(void*) = NULL; int accessU, access_npiv, access_ipiv, access_ppiv; diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c index b17f26a48..303e6a674 100644 --- a/runtime/starpu/codelets/codelet_zlaswp_batched.c +++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c @@ -72,7 +72,7 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options, void **clargs_ptr ) { int task_num = 0; - int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size; + int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap; int nhandles; struct cl_laswp_batched_args_t *clargs = *clargs_ptr; if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) { -- GitLab From 470e5c5fd3f1e6a68c1feb5079dbc4a0c3572113 Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Tue, 7 Jan 2025 12:28:27 +0100 Subject: [PATCH 5/6] zgetrf: clean permutation --- compute/pzgetrf.c | 160 ++++++++++++++++++++++++++-------------------- 1 file changed, 89 insertions(+), 71 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 98f9d0470..7081a1f7f 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -338,10 +338,12 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, int k, RUNTIME_option_t *options ) { +#if defined(CHAMELEON_USE_MPI) chameleon_get_proc_involved_in_panelk_2dbc( A, k, k, ws ); if ( !ws->involved ) { return; } +#endif /* TODO: Should be replaced by a function pointer */ switch( ws->alg ) { @@ -392,19 +394,6 @@ chameleon_pzgetrf_panel_permute( struct chameleon_pzgetrf_s *ws, int tempkm, tempkn, tempnn, minmn; int withlacpy; - chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - INSERT_TASK_zperm_allreduce_send_invp( options, 
ipiv, k, A, k, n ); - } - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); - } - - if ( !ws->involved ) { - return; - } - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempkn = A->get_blkdim( A, k, DIM_n, A->n ); tempnn = A->get_blkdim( A, n, DIM_n, A->n ); @@ -457,19 +446,6 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, int tempkm, tempkn, tempnn, minmn; int withlacpy; - chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); - INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); - } - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); - } - - if ( !ws->involved ) { - return; - } - void **clargs = malloc( sizeof(char *) ); *clargs = NULL; @@ -508,6 +484,80 @@ chameleon_pzgetrf_panel_permute_batched( struct chameleon_pzgetrf_s *ws, } } +static inline void +chameleon_pzgetrf_panel_permute_forward( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options ) +{ +#if defined(CHAMELEON_USE_MPI) + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } +#endif + + if ( ws->batch_size_swap > 0 ) { + chameleon_pzgetrf_panel_permute_batched( ws, A, 
ipiv, k, n, options ); + } + else { + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + } +} + +static inline void +chameleon_pzgetrf_panel_permute_backward( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + int n, + RUNTIME_option_t *options, + RUNTIME_sequence_t *sequence ) +{ + int tempkm, tempnn; + +#if defined(CHAMELEON_USE_MPI) + chameleon_get_proc_involved_in_panelk_2dbc( A, k, n, ws ); + if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { + INSERT_TASK_zperm_allreduce_send_perm( options, ipiv, k, A->myrank, ws->np_involved, ws->proc_involved ); + INSERT_TASK_zperm_allreduce_send_invp( options, ipiv, k, A, k, n ); + } + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + INSERT_TASK_zperm_allreduce_send_A( options, A, k, n, A->myrank, ws->np_involved, ws->proc_involved ); + } + + if ( !ws->involved ) { + return; + } +#endif + + if ( ws->batch_size_swap > 0 ) { + chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); + } + else { + chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); + } + + if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { + + tempkm = A->get_blkdim( A, k, DIM_m, A->m ); + tempnn = A->get_blkdim( A, n, DIM_n, A->n ); + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempnn, + Wu(A->myrank, n), A(k, n) ); + RUNTIME_data_flush( sequence, A(k, n) ); + } +} + static inline void chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -515,7 +565,7 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, RUNTIME_option_t *options ) { CHAM_context_t *chamctxt = chameleon_context_self(); - int m, tempmm, tempkn, q; + int m, n, tempmm, tempkn, q; int lookahead = chamctxt->lookahead; int P = chameleon_desc_datadist_get_iparam(A, 0); int Q = chameleon_desc_datadist_get_iparam(A, 1); @@ -583,12 +633,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, tempkm = A->get_blkdim( A, k, DIM_m, A->m ); tempnn 
= A->get_blkdim( A, n, DIM_n, A->n ); - if ( ws->batch_size_swap > 0 ) { - chameleon_pzgetrf_panel_permute_batched( ws, A, ipiv, k, n, options ); - } - else { - chameleon_pzgetrf_panel_permute( ws, A, ipiv, k, n, options ); - } + chameleon_pzgetrf_panel_permute_forward( ws, A, ipiv, k, n, options ); if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { for ( p = 0; p < ws->np_involved; p++ ) { @@ -607,6 +652,7 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, tempkm, tempnn, A->mb, zone, A(k, k), + zone, Wu(A->myrank, k), Wu(A->myrank, n) ); } @@ -682,54 +728,26 @@ void chameleon_pzgetrf( struct chameleon_pzgetrf_s *ws, } /* Flush panel k */ - for (m = k; m < A->mt; m++) { + for (m = k+1; m < A->mt; m++) { RUNTIME_data_flush( sequence, A(m, k) ); } + RUNTIME_data_flush( sequence, Wu(A->myrank, k) ); RUNTIME_iteration_pop( chamctxt ); } CHAMELEON_Desc_Flush( &(ws->Wl), sequence ); /* Backward pivoting */ - if ( ws->batch_size > 0 ) { - for (k = 1; k < min_mnt; k++) { - for (n = 0; n < k; n++) { - if ( chameleon_involved_in_panelk_2dbc( A, k ) || - chameleon_involved_in_panelk_2dbc( A, n ) ) - { - chameleon_pzgetrf_panel_permute_batched( ws, A, IPIV, k, n, &options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); - tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, - Wu(A->myrank, n), A(k, n) ); - RUNTIME_data_flush( sequence, A(k, n) ); - } - } - RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); - } - RUNTIME_perm_flushk( sequence, IPIV, k ); - } - } - else { - for (k = 1; k < min_mnt; k++) { - for (n = 0; n < k; n++) { - if ( chameleon_involved_in_panelk_2dbc( A, k ) || - chameleon_involved_in_panelk_2dbc( A, n ) ) - { - chameleon_pzgetrf_panel_permute( ws, A, IPIV, k, n, &options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, n ) ) { - tempkm = A->get_blkdim( A, k, DIM_m, A->m ); - 
tempnn = A->get_blkdim( A, n, DIM_n, A->n ); - INSERT_TASK_zlacpy( &options, ChamUpperLower, tempkm, tempnn, - Wu(A->myrank, n), A(k, n) ); - RUNTIME_data_flush( sequence, A(k, n) ); - } - } - RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); + for (k = 1; k < min_mnt; k++) { + for (n = 0; n < k; n++) { + if ( chameleon_involved_in_panelk_2dbc( A, k ) || + chameleon_involved_in_panelk_2dbc( A, n ) ) + { + chameleon_pzgetrf_panel_permute_backward( ws, A, IPIV, k, n, &options, sequence ); } - RUNTIME_perm_flushk( sequence, IPIV, k ); + RUNTIME_data_flush( sequence, Wu(A->myrank, n) ); } + RUNTIME_perm_flushk( sequence, IPIV, k ); } CHAMELEON_Desc_Flush( &(ws->Wu), sequence ); -- GitLab From 0e64bc8b2455a84da029f732e170c12d66a05e8a Mon Sep 17 00:00:00 2001 From: Alycia Lisito <alycia.lisito@inria.fr> Date: Thu, 13 Feb 2025 17:43:55 +0100 Subject: [PATCH 6/6] zgetrf: send Akk before allreduce --- compute/pzgetrf.c | 58 +++++++++++++++++++------- control/descriptor_helpers.c | 20 +++++++++ include/chameleon/descriptor_helpers.h | 1 + 3 files changed, 64 insertions(+), 15 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 7081a1f7f..635bbbb84 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -565,7 +565,7 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, RUNTIME_option_t *options ) { CHAM_context_t *chamctxt = chameleon_context_self(); - int m, n, tempmm, tempkn, q; + int m, n, tempmm, tempkn, tempkm, p, q, involved, np; int lookahead = chamctxt->lookahead; int P = chameleon_desc_datadist_get_iparam(A, 0); int Q = chameleon_desc_datadist_get_iparam(A, 1); @@ -610,6 +610,44 @@ chameleon_pzgetrf_panel_update_ws( struct chameleon_pzgetrf_s *ws, RUNTIME_data_flush( options->sequence, A(m, k) ); } } + + tempkm = A->get_blkdim( A, k, DIM_m, A->m ); + np = chameleon_desc_datadist_get_iparam(A, 1) * chameleon_desc_datadist_get_iparam(A, 0); +#if defined(CHAMELEON_USE_MPI) + /* Send Akk for replicated trsm */ + if ( A->myrank == 
chameleon_getrankof_2d( A, k, k ) ) { + for ( p = 0; p < np; p++ ) { + involved = 0; + for ( n = k+1; n < A->nt; n++ ) { + if ( chameleon_p_involved_in_panelk_2dbc( A, n, p ) ) { + involved = 1; + break; + } + } + if ( involved ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(p, k) ); + } + } + } + else { + involved = 0; + for ( n = k+1; n < A->nt; n++ ) { + if ( chameleon_involved_in_panelk_2dbc( A, n ) ) { + involved = 1; + break; + } + } + if ( involved ) { + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(A->myrank, k) ); + } + } +#else + INSERT_TASK_zlacpy( options, ChamUpperLower, tempkm, tempkn, + A(k, k), Wu(A->myrank, k) ); +#endif + RUNTIME_data_flush( options->sequence, A(k, k) ); } static inline void @@ -635,23 +673,14 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, chameleon_pzgetrf_panel_permute_forward( ws, A, ipiv, k, n, options ); - if ( A->myrank == chameleon_getrankof_2d( A, k, k ) ) { - for ( p = 0; p < ws->np_involved; p++ ) { - INSERT_TASK_ztrsm( - options, - ChamLeft, ChamLower, ChamNoTrans, ChamUnit, - tempkm, tempnn, A->mb, - zone, A(k, k), - Wu(ws->proc_involved[p], n) ); - RUNTIME_data_flush( options->sequence, Wu(ws->proc_involved[p], n) ); - } - } - else if ( ws->involved ) { +#if defined(CHAMELEON_USE_MPI) + if ( ws->involved ) +#endif + { INSERT_TASK_ztrsm( options, ChamLeft, ChamLower, ChamNoTrans, ChamUnit, tempkm, tempnn, A->mb, - zone, A(k, k), zone, Wu(A->myrank, k), Wu(A->myrank, n) ); } @@ -677,7 +706,6 @@ chameleon_pzgetrf_panel_update( struct chameleon_pzgetrf_s *ws, } RUNTIME_data_flush( options->sequence, Wu(A->myrank, n) ); - RUNTIME_data_flush( options->sequence, A(k, k) ); RUNTIME_data_flush( options->sequence, A(k, n) ); } diff --git a/control/descriptor_helpers.c b/control/descriptor_helpers.c index d5e143063..6a0492111 100644 --- a/control/descriptor_helpers.c +++ b/control/descriptor_helpers.c @@ -100,6 +100,26 @@ int 
chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k ) { return ( myrank % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) ); } +/** + * @brief Test if the MPI process p is involved in the panel k for 2DBC distributions. + * + * @param[in] A + * The matrix descriptor. + * + * @param[in] k + * The index of the panel to test. + * + * @param[in] p + * The rank of the MPI process. + * + * @return 1 if the MPI process p contributes to the panel k. + * 0 if the MPI process p doesn't contribute to the panel k. + * + */ +int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p ) { + return ( p % chameleon_desc_datadist_get_iparam(A,1) == k % chameleon_desc_datadist_get_iparam(A,1) ); +} + /** * @brief Test if the current MPI process is involved in the panel k for 2DBC distributions. * diff --git a/include/chameleon/descriptor_helpers.h b/include/chameleon/descriptor_helpers.h index 9e60ef27d..f8caf5080 100644 --- a/include/chameleon/descriptor_helpers.h +++ b/include/chameleon/descriptor_helpers.h @@ -64,6 +64,7 @@ int chameleon_getrankof_custom ( const CHAM_desc_t *A, int m, int n ); */ int chameleon_involved_in_panelk_2dbc( const CHAM_desc_t *A, int An ); +int chameleon_p_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int p ); void chameleon_get_proc_involved_in_panelk_2dbc( const CHAM_desc_t *A, int k, int n, -- GitLab