From 945e4dec3888f7dcaccf6ac992e7df69eab3a572 Mon Sep 17 00:00:00 2001 From: Ana Hourcau <ana.hourcau@inria.fr> Date: Wed, 15 May 2024 15:51:21 +0200 Subject: [PATCH] compute: Adding hered algorithms for reduction on hermitian matrices --- compute/CMakeLists.txt | 5 +- compute/pzhered.c | 288 ++++++++++++++++++++++++++++++++ compute/zhered.c | 173 +++++++++++++++++++ control/compute_z.h | 5 +- include/chameleon/chameleon_z.h | 2 + 5 files changed, 471 insertions(+), 2 deletions(-) create mode 100644 compute/pzhered.c create mode 100644 compute/zhered.c diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 6dfa83891..cc25fb7d2 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -27,7 +27,8 @@ # @author Alycia Lisito # @author Loris Lucido # @author Matthieu Kuhn -# @date 2024-04-03 +# @author Ana Hourcau +# @date 2024-07-17 # ### @@ -193,10 +194,12 @@ set(ZSRC ################## # MIXED PRECISION ################## + pzhered.c pzlag2c.c pzgered.c pzgerst.c ### + zhered.c zgered.c zgerst.c #zcgels.c diff --git a/compute/pzhered.c b/compute/pzhered.c new file mode 100644 index 000000000..97e171b3f --- /dev/null +++ b/compute/pzhered.c @@ -0,0 +1,288 @@ +/** + * + * @file pzhered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zhered parallel algorithm + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Ana Hourcau + * @date 2024-07-17 + * @precisions normal z -> z d + * + */ +// ALLOC_WS : A->mb +// ALLOC_WS : A->nb +// WS_ADD : A->mb + A->nb +#include "control/common.h" +#include <coreblas/lapacke.h> + +#define A(m, n) A, (m), (n) +#define W(desc, m, n) (desc), (m), (n) + +static inline void +chameleon_pzhered_frb( cham_trans_t trans, cham_uplo_t uplo, + CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, + RUNTIME_option_t *options ) +{ + double alpha = 1.0; + double beta = 0.0; + + int m, n; + int MT = A->mt; + int NT = A->nt; + int M = A->m; + int N = A->n; + int P = Welt->p; + int Q = Welt->q; + + /* Initialize workspaces for tile norms */ + for (m = 0; m < Wnorm->mt; m++) + { + for (n = 0; n < NT; n++) + { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Wnorm->mb, Wnorm->nb, + alpha, beta, + W(Wnorm, m, n)); + } + } + + /* Initialize workspaces */ + for (m = 0; m < Welt->mt; m++) + { + for (n = 0; n < Welt->nt; n++) + { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Welt->mb, Welt->nb, + alpha, beta, + W(Welt, m, n)); + } + } + + /** + * Step 1: + * For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) ) + */ + for (m = 0; m < MT; m++) + { + int nmin = (uplo == ChamUpper) ? m : 0; + int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT; + + int tempmm = (m == (MT - 1)) ? M - m * A->mb : A->mb; + + for (n = nmin; n < nmax; n++) + { + int tempnn = (n == (NT - 1)) ? N - n * A->nb : A->nb; + + if (n == m) + { + if ( trans == ChamConjTrans ) { + INSERT_TASK_zhessq( + options, ChamEltwise, uplo, tempmm, + A(m, n), W( Wnorm, m, n) ); + } + else { + INSERT_TASK_zsyssq( + options, ChamEltwise, uplo, tempmm, + A(m, n), W( Wnorm, m, n) ); + } + } + else + { + INSERT_TASK_zgessq( + options, ChamEltwise, tempmm, tempnn, + A(m, n), W( Wnorm, m, n )); + INSERT_TASK_zgessq( + options, ChamEltwise, tempmm, tempnn, + A(m, n), W( Wnorm, n, m )); + } + } + } + + for(m = 0; m < MT; m++) { + for(n = Q; n < NT; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) ); + } + + /** + * Step 2: + * For each j, W(m, j) = reduce( W( Welt, m, 0..Q-1) ) + */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) ); + } + } + + /** + * Step 3: + * For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) ) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) ); + } + + /** + * Step 4: + * For each i, Welt(i, n) = max( Welt(0..P-1, n) ) + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) ); + } + + /* Compute the norm of each tile, and the full norm */ + for (m = 0; m < MT; m++) + { + int nmin = (uplo == ChamUpper) ? m : 0; + int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, NT) : NT; + + for (n = nmin; n < nmax; n++) + { + /* Compute the final norm of the tile */ + INSERT_TASK_dplssq2( + options, 1, W( Wnorm, m, n ) ); + } + } + INSERT_TASK_dplssq2( + options, 1, W( Welt, 0, 0) ); + + /** + * Broadcast the result + */ + for (m = 0; m < A->p; m++) + { + for (n = 0; n < A->q; n++) + { + if ((m != 0) || (n != 0)) + { + INSERT_TASK_dlacpy( + options, + ChamUpperLower, 1, 1, + W(Welt, 0, 0), W(Welt, m, n)); + } + } + } +} + +/** + * + */ +void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t Wcol; + CHAM_desc_t Welt; + double gnorm, lnorm, threshold, eps; + + int workmt, worknt; + int m, n; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) + { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + workmt = chameleon_max(A->mt, A->p); + worknt = chameleon_max(A->nt, A->q); + + RUNTIME_options_ws_alloc(&options, 1, 0); + + /* Matrix to store the norm of each element */ + chameleon_desc_init(&Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q, + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg); + + /* Matrix to compute the global frobenius norm */ + chameleon_desc_init(&Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + workmt * 2, worknt, 0, 0, workmt * 2, worknt, A->p, A->q, + NULL, NULL, NULL, NULL); + + chameleon_pzhered_frb( trans, uplo, A, &Wcol, &Welt, &options ); + + CHAMELEON_Desc_Flush(&Wcol, sequence); + CHAMELEON_Desc_Flush(&Welt, sequence); + CHAMELEON_Desc_Flush(A, sequence); + + RUNTIME_sequence_wait(chamctxt, sequence); + + gnorm = *((double *)Welt.get_blkaddr(&Welt, A->myrank / A->q, A->myrank % A->q)); + chameleon_desc_destroy(&Welt); + + /** + * Reduce the precision of the tiles if possible + */ + if (prec < 0.) + { +#if !defined(CHAMELEON_SIMULATION) + eps = LAPACKE_dlamch_work('e'); +#else +#if defined(PRECISION_z) || defined(PRECISION_d) + eps = 1.e-15; +#else + eps = 1.e-7; +#endif +#endif + } + else + { + eps = prec; + } + threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt)); + +#if defined(CHAMELEON_DEBUG_GERED) + fprintf(stderr, + "[%2d] The norm of A is: %e\n" + "[%2d] The requested precision is: %e\n" + "[%2d] The computed threshold is: %e\n", + A->myrank, gnorm, + A->myrank, eps, + A->myrank, threshold); +#endif + for (m = 0; m < A->mt; m++) + { + int tempmm = (m == (A->mt - 1)) ? A->m - m * A->mb : A->mb; + int nmin = (uplo == ChamUpper) ? m : 0; + int nmax = (uplo == ChamLower) ? chameleon_min(m + 1, A->nt) : A->nt; + + for (n = nmin; n < nmax; n++) + { + CHAM_tile_t *tile = A->get_blktile(A, m, n); + + int tempnn = (n == (A->nt - 1)) ? A->n - n * A->nb : A->nb; + + /* + * u_{high} = 1e-16 (later should be application accuracy) + * u_{low} = 1e-8 + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < threshold / u_{low} + */ + + INSERT_TASK_zgered( &options, threshold, + tempmm, tempnn, A( m, n ), W( &Wcol, m, n ) ); + } + } + + CHAMELEON_Desc_Flush(A, sequence); + RUNTIME_sequence_wait(chamctxt, sequence); + + chameleon_desc_destroy(&Wcol); + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zhered.c b/compute/zhered.c new file mode 100644 index 000000000..5603ea68b --- /dev/null +++ b/compute/zhered.c @@ -0,0 +1,173 @@ +/** + * + * @file zhered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2024 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zhered wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Ana Hourcau + * @date 2024-07-17 + * @precisions normal z -> z d + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the synchronous version of CHAMELEON_zheredinit_Tile_Async(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] uplo + * = ChamUpper: Upper triangle of A is stored; + * = ChamLower: Lower triangle of A is stored. + * + * @param[in] A + * On entry, the symmetric positive definite (or Hermitian) matrix A. + * If uplo = ChamUpper, the leading N-by-N upper triangular part of A + * contains the upper triangular part of the matrix A, and the strictly lower triangular + * part of A is not referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower + * triangular part of the matrix A, and the strictly upper triangular part of A is not + * referenced. + * On exit, if return value = 0, the factor U or L from the Cholesky factorization + * A = U^H*U or A = L*L^H. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval >0 if i, the leading minor of order i of A is not positive definite, so the + * factorization could not be completed, and the solution has not been computed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zhered + * @sa CHAMELEON_zhered_Tile_Async + * @sa CHAMELEON_cpotrfmp_Tile + * @sa CHAMELEON_dpotrfmp_Tile + * @sa CHAMELEON_spotrfmp_Tile + * @sa CHAMELEON_zpotrs_Tile + * + */ +int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zheredinit_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zhered_Tile_Async( uplo, precision, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the non-blocking equivalent of CHAMELEON_zhered_Tile(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. It may return before the computation is + * finished. This function allows for pipelining operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zhered + * @sa CHAMELEON_zhered_Tile + * @sa CHAMELEON_cpotrfmp_Tile_Async + * @sa CHAMELEON_dpotrfmp_Tile_Async + * @sa CHAMELEON_spotrfmp_Tile_Async + * @sa CHAMELEON_zpotrs_Tile_Async + * + */ +int CHAMELEON_zhered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zhered_Tile_Async", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zhered_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zhered_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + /* + * Quick return + */ + if ( chameleon_max( A->m, A->n ) == 0 ) { + return CHAMELEON_SUCCESS; + } + + chameleon_pzhered( ChamConjTrans, uplo, precision, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index 645018f83..088e03140 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -22,7 +22,8 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-09-08 + * @author Ana Hourcau + * @date 2024-07-17 * @precisions normal z -> c d s * */ @@ -81,6 +82,8 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64 #if defined(PRECISION_z) || defined(PRECISION_d) void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +void chameleon_pzhered( cham_trans_t trans, cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzgerst( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); #endif diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 75e214843..3f33260f4 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -170,6 +170,7 @@ int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); +int CHAMELEON_zhered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A); @@ -249,6 +250,7 @@ int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequen int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zhered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -- GitLab