From 141d0f6e4aa5bdad67e5df24c3e351ab45d08495 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Tue, 5 Sep 2023 10:50:50 +0200 Subject: [PATCH] zgetrf: Add blocked version algorithm of the getrf --- compute/pzgetrf.c | 70 +++++++++++++++++++++++++++++++++++++++++++-- compute/zgetrf.c | 29 ++++++++++++++++--- control/compute_z.h | 3 +- 3 files changed, 95 insertions(+), 7 deletions(-) diff --git a/compute/pzgetrf.c b/compute/pzgetrf.c index 94dee2b5b..ccd05fbcc 100644 --- a/compute/pzgetrf.c +++ b/compute/pzgetrf.c @@ -16,7 +16,7 @@ * @author Mathieu Faverge * @author Emmanuel Agullo * @author Matthieu Kuhn - * @date 2023-08-31 + * @date 2023-09-08 * @precisions normal z -> s d c * */ @@ -24,6 +24,7 @@ #define A(m,n) A, m, n #define U(m,n) &(ws->U), m, n +#define Up(m,n) &(ws->Up), m, n /* * All the functions below are panel factorization variant. @@ -158,6 +159,71 @@ chameleon_pzgetrf_panel_facto_percol( struct chameleon_pzgetrf_s *ws, RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); } +static inline void +chameleon_pzgetrf_panel_facto_blocked( struct chameleon_pzgetrf_s *ws, + CHAM_desc_t *A, + CHAM_ipiv_t *ipiv, + int k, + RUNTIME_option_t *options ) +{ + int m, h, b, nbblock; + int tempkm, tempkn, minmn; + + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + minmn = chameleon_min( tempkm, tempkn ); + + /* Update the number of column */ + ipiv->n = minmn; + nbblock = chameleon_ceil( minmn, ws->ib ); + + /* + * Algorithm per column with pivoting + */ + for (b=0; b<nbblock; b++){ + int hmax = b == nbblock-1 ? minmn + 1 - b * ws->ib : ws->ib; + + for (h=0; h<hmax; h++){ + int j = h + b * ws->ib; + + INSERT_TASK_zgetrf_blocked_diag( + options, + j, k * A->mb, ws->ib, + A(k, k), Up(k, k), + ipiv ); + + for (m = k+1; m < A->mt; m++) { + + INSERT_TASK_zgetrf_blocked_offdiag( + options, + j, m * A->mb, ws->ib, + A(m, k), Up(k, k), + ipiv ); + } + + + if ( (b < (nbblock-1)) && (h == hmax-1) ) { + INSERT_TASK_zgetrf_blocked_trsm( + options, + ws->ib, tempkn, b * ws->ib + hmax, ws->ib, + Up(k, k), + ipiv ); + } + + assert( j<= minmn ); + if ( j < minmn ) { + /* Reduce globally (between MPI processes) */ + RUNTIME_ipiv_reducek( options, ipiv, k, j ); + } + } + } + RUNTIME_data_flush( options->sequence, Up(k, k) ); + + /* Flush temporary data used for the pivoting */ + INSERT_TASK_ipiv_to_perm( options, k * A->mb, tempkm, minmn, ipiv, k ); + RUNTIME_ipiv_flushk( options->sequence, ipiv, k ); +} + static inline void chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, CHAM_desc_t *A, @@ -176,7 +242,7 @@ chameleon_pzgetrf_panel_facto( struct chameleon_pzgetrf_s *ws, break; case ChamGetrfPPiv: - chameleon_pzgetrf_panel_facto_percol( ws, A, ipiv, k, options ); + chameleon_pzgetrf_panel_facto_blocked( ws, A, ipiv, k, options ); break; case ChamGetrfNoPiv: diff --git a/compute/zgetrf.c b/compute/zgetrf.c index 38ef81719..845b127b1 100644 --- a/compute/zgetrf.c +++ b/compute/zgetrf.c @@ -19,7 +19,7 @@ * @author Florent Pruvost * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-08-31 + * @date 2023-09-08 * * @precisions normal z -> s d c * @@ -88,6 +88,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) chameleon_cleanenv( algostr ); } + /* Allocation of U for permutation of the panels */ if ( ws->alg == ChamGetrfNoPivPerColumn ) { chameleon_desc_init( &(ws->U), CHAMELEON_MAT_ALLOC_TILE, ChamComplexDouble, 1, A->nb, A->nb, @@ -112,6 +113,17 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A ) ws->ib = 1; } + /* Allocation of Up for the permutation of the diagonal panel per block */ + if ( ws->alg == ChamGetrfPPiv ) { + /* TODO: Should be restricted to diagonal tiles */ + /* Possibly to a single handle with a permutation of the ownership */ + chameleon_desc_init( &(ws->Up), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, ws->ib, A->nb, ws->ib * A->nb, + A->mt * ws->ib, A->nt * A->nb, 0, 0, + A->mt * ws->ib, A->nt * A->nb, A->p, A->q, + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); + } + return ws; } @@ -145,6 +157,9 @@ CHAMELEON_zgetrf_WS_Free( void *user_ws ) { chameleon_desc_destroy( &(ws->U) ); } + if ( ws->alg == ChamGetrfPPiv ) { + chameleon_desc_destroy( &(ws->Up) ); + } free( ws ); } @@ -249,7 +264,9 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) /* Allocate workspace for partial pivoting */ ws = CHAMELEON_zgetrf_WS_Alloc( &descAt ); - if ( ws->alg == ChamGetrfPPivPerColumn ) { + if ( ( ws->alg == ChamGetrfPPivPerColumn ) || + ( ws->alg == ChamGetrfPPiv ) ) + { chameleon_ipiv_init( &descIPIV, &descAt, IPIV ); } @@ -260,13 +277,17 @@ CHAMELEON_zgetrf( int M, int N, CHAMELEON_Complex64_t *A, int LDA, int *IPIV ) chameleon_ztile2lap( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower, sequence, &request ); - if ( ws->alg == ChamGetrfPPivPerColumn ) { + if ( ( ws->alg == ChamGetrfPPivPerColumn ) || + ( ws->alg == ChamGetrfPPiv ) ) + { RUNTIME_ipiv_gather( &descIPIV, IPIV, 0 ); } chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ - if ( ws->alg == ChamGetrfPPivPerColumn ) { + if ( ( ws->alg == ChamGetrfPPivPerColumn ) || + ( ws->alg == ChamGetrfPPiv ) ) + { chameleon_ipiv_destroy( &descIPIV ); } CHAMELEON_zgetrf_WS_Free( ws ); diff --git a/control/compute_z.h b/control/compute_z.h index 9032c20f2..22b9f06f0 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -22,7 +22,7 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-08-22 + * @date 2023-09-08 * @precisions normal z -> c d s * */ @@ -45,6 +45,7 @@ struct chameleon_pzgetrf_s { cham_getrf_t alg; int ib; /* Internal blocking parameter */ CHAM_desc_t U; + CHAM_desc_t Up; }; /** -- GitLab