diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index df41dd109b68835ae2f07759d0bb81b758bfe075..5318928a248fdede8627d1b38d9de7cc2a95a961 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -11,7 +11,7 @@ @author Florent Pruvost @author Nathalie Furmento @author Alycia Lisito - @date 2023-07-04 + @date 2023-07-06 """ _extra_blas = [ @@ -48,6 +48,7 @@ _extra_blas = [ ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), ('', 'sgersum', 'dgersum', 'cgersum', 'zgersum' ), ('', 'sprint', 'dprint', 'cprint', 'zprint' ), + ('', 'sgered', 'dgered', 'cgered', 'zgered' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] @@ -114,6 +115,7 @@ subs = { ('CHAMELEON_p', 'CHAMELEON_s', 'CHAMELEON_d', 'CHAMELEON_c', 'CHAMELEON_z' ), ('RUNTIME_P', 'RUNTIME_s', 'RUNTIME_d', 'RUNTIME_c', 'RUNTIME_z' ), ('chameleon_p', 'chameleon_s', 'chameleon_d', 'chameleon_c', 'chameleon_z' ), + ('codelet_p', 'codelet_ds', 'codelet_ds', 'codelet_zc', r'codelet_zc\b' ), ('codelet_p', 'codelet_s', 'codelet_d', 'codelet_c', 'codelet_z' ), ('runtime_p', 'runtime_s', 'runtime_d', 'runtime_c', 'runtime_z' ), ('testing_p', 'testing_s', 'testing_d', 'testing_c', 'testing_z' ), diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 14eec76587d3ae4bc865e23dd2725625bc0694e9..b9dc95f8bd149c456cd7ec390aa498beeb23eccb 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -27,7 +27,7 @@ # @author Alycia Lisito # @author Loris Lucido # @author Matthieu Kuhn -# @date 2023-01-30 +# @date 2023-07-06 # ### @@ -192,7 +192,9 @@ set(ZSRC # MIXED PRECISION ################## pzlag2c.c + pzgered.c ### + zgered.c #zcgels.c #zcgesv.c #zcposv.c diff --git a/compute/pzgered.c b/compute/pzgered.c new file mode 100644 index 0000000000000000000000000000000000000000..f934c494df6a576e4d2fe52bc2a2091ff9b4827a --- /dev/null +++ b/compute/pzgered.c @@ -0,0 +1,260 @@ +/** + * + * @file pzgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zlange parallel algorithm + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> z d + * + */ +//ALLOC_WS : A->mb +//ALLOC_WS : A->nb +//WS_ADD : A->mb + A->nb +#include "control/common.h" +#include <coreblas/lapacke.h> + +#define A( m, n ) A, (m), (n) +#define W( desc, m, n ) (desc), (m), (n) + +static inline void +chameleon_pzgered_frb( cham_uplo_t uplo, + CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, + RUNTIME_option_t *options ) +{ + double alpha = 1.0; + double beta = 0.0; + + int m, n; + int minMNT = chameleon_min( A->mt, A->nt ); + int minMN = chameleon_min( A->m, A->n ); + int MT = (uplo == ChamUpper) ? minMNT : A->mt; + int NT = (uplo == ChamLower) ? minMNT : A->nt; + int M = (uplo == ChamUpper) ? minMN : A->m; + int N = (uplo == ChamLower) ? minMN : A->n; + int P = Welt->p; + int Q = Welt->q; + + /* Initialize workspaces for tile norms */ + for(m = 0; m < Wnorm->mt; m++) { + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; + + for(n = nmin; n < nmax; n++) { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Wnorm->mb, Wnorm->nb, + alpha, beta, + W( Wnorm, m, n ) ); + } + } + + /* Initialize workspaces */ + for(m = 0; m < Welt->mt; m++) { + for(n = 0; n < Welt->nt; n++) { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Welt->mb, Welt->nb, + alpha, beta, + W( Welt, m, n ) ); + } + } + + /** + * Step 1: + * For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) ) + */ + for(m = 0; m < MT; m++) { + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; + + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + + for(n = nmin; n < nmax; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + if ( (n == m) && (uplo != ChamUpperLower) ) { + INSERT_TASK_ztrssq( + options, + uplo, ChamNonUnit, tempmm, tempnn, + A(m, n), W( Wnorm, m, n) ); + } + else { + INSERT_TASK_zgessq( + options, + ChamEltwise, + tempmm, tempnn, + A(m, n), W( Wnorm, m, n) ); + } + + /* Compress the info per line */ + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) ); + + /* Compute the final norm of the tile */ + INSERT_TASK_dplssq2( + options, 1, W( Wnorm, m, n ) ); + } + + /** + * Step 2: + * For each j, W(m, j) = reduce( Welt(m, 0..Q-1) ) + */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) ); + } + } + + /** + * Step 3: + * For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) ) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) ); + } + + /** + * Step 4: + * For each i, Welt(i, n) = max( Welt(0..P-1, n) ) + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) ); + } + + INSERT_TASK_dplssq2( + options, 1, W( Welt, 0, 0) ); + + /** + * Broadcast the result + */ + for(m = 0; m < A->p; m++) { + for(n = 0; n < A->q; n++) { + if ( (m != 0) || (n != 0) ) { + INSERT_TASK_dlacpy( + options, + ChamUpperLower, 1, 1, + W( Welt, 0, 0 ), W( Welt, m, n ) ); + } + } + } +} + +/** + * + */ +void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t Wcol; + CHAM_desc_t Welt; + double gnorm, lnorm, threshold, eps; + + int workmt, worknt; + int m, n; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + workmt = chameleon_max( A->mt, A->p ); + worknt = chameleon_max( A->nt, A->q ); + + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + /* Matrix to store the norm of each element */ + chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q, + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); + + /* Matrix to compute the global frobenius norm */ + chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + workmt*2, worknt, 0, 0, workmt*2, worknt, A->p, A->q, + NULL, NULL, NULL, NULL ); + + chameleon_pzgered_frb( uplo, A, &Wcol, &Welt, &options ); + + CHAMELEON_Desc_Flush( &Wcol, sequence ); + CHAMELEON_Desc_Flush( &Welt, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + + RUNTIME_sequence_wait( chamctxt, sequence ); + + gnorm = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q )); + chameleon_desc_destroy( &Welt ); + + /** + * Reduce the precision of the tiles if possible + */ + if ( prec < 0. ) { +#if !defined(CHAMELEON_SIMULATION) + double eps = LAPACKE_dlamch_work('e'); +#else +#if defined(PRECISION_z) || defined(PRECISION_d) + double eps = 1.e-15; +#else + double eps = 1.e-7; +#endif +#endif + } + else { + eps = prec; + } + threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt)); + +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] The norm of A is: %e\n" + "[%2d] The requested precision is: %e\n" + "[%2d] The computed threshold is: %e\n", + A->myrank, gnorm, + A->myrank, eps, + A->myrank, threshold ); +#endif + for(m = 0; m < A->mt; m++) { + int tempmm = ( m == (A->mt-1) ) ? A->m - m * A->mb : A->mb; + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, A->nt) : A->nt; + + for(n = nmin; n < nmax; n++) { + CHAM_tile_t *tile = A->get_blktile( A, m, n ); + if ( tile->rank == A->myrank ) { + int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb; + + /* Get the frobenius norm of the tile A( m, n ) */ + lnorm = ((double*)((Wcol.get_blktile( &Wcol, m, n ))->mat))[0]; + + /* + * u_{high} = 1e-16 (later should be application accuraccy) + * u_{low} = 1e-8 + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < threshold / u_{low} + */ + INSERT_TASK_zgered( &options, threshold, lnorm, + tempmm, tempnn, A( m, n ) ); + } + } + } + + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + + chameleon_desc_destroy( &Wcol ); + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zgered.c b/compute/zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..7c95e7bd10b68c129545c69034ffecc964abc2ce --- /dev/null +++ b/compute/zgered.c @@ -0,0 +1,172 @@ +/** + * + * @file zgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> z d + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the synchronous version of CHAMELEON_zgeredinit_Tile_Async(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] uplo + * = ChamUpper: Upper triangle of A is stored; + * = ChamLower: Lower triangle of A is stored. + * + * @param[in] A + * On entry, the symmetric positive definite (or Hermitian) matrix A. + * If uplo = ChamUpper, the leading N-by-N upper triangular part of A + * contains the upper triangular part of the matrix A, and the strictly lower triangular + * part of A is not referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower + * triangular part of the matrix A, and the strictly upper triangular part of A is not + * referenced. + * On exit, if return value = 0, the factor U or L from the Cholesky factorization + * A = U^H*U or A = L*L^H. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval >0 if i, the leading minor of order i of A is not positive definite, so the + * factorization could not be completed, and the solution has not been computed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zgered + * @sa CHAMELEON_zgered_Tile_Async + * @sa CHAMELEON_cpotrfmp_Tile + * @sa CHAMELEON_dpotrfmp_Tile + * @sa CHAMELEON_spotrfmp_Tile + * @sa CHAMELEON_zpotrs_Tile + * + */ +int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgeredinit_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgered_Tile_Async( uplo, precision, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the non-blocking equivalent of CHAMELEON_zgered_Tile(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. It may return before the computation is + * finished. This function allows for pipelining operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgered + * @sa CHAMELEON_zgered_Tile + * @sa CHAMELEON_cpotrfmp_Tile_Async + * @sa CHAMELEON_dpotrfmp_Tile_Async + * @sa CHAMELEON_spotrfmp_Tile_Async + * @sa CHAMELEON_zpotrs_Tile_Async + * + */ +int CHAMELEON_zgered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgered_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgered_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + /* + * Quick return + */ + if ( chameleon_max( A->m, A->n ) == 0 ) { + return CHAMELEON_SUCCESS; + } + + chameleon_pzgered( uplo, precision, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index 8bec9da5fa4bab356cde2bdb61b7717ffd086fde..5bcb8e0cd97fb7cb99f013080b9c0db1dcfc2ac8 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -22,7 +22,7 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-07-05 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -76,6 +76,10 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64 /** * Declarations of parallel functions (dynamic scheduling) - alphabetical order */ +#if defined(PRECISION_z) || defined(PRECISION_d) +void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +#endif int chameleon_pzgebrd( int genD, cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index fa5af0a740a3af86a8ce25c0e9cc74f72eaf4296..c60da7daeab688694d32fc2ae145b5c6c82bed1a 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -11,7 +11,7 @@ * * @brief Chameleon CHAMELEON_complex64_t wrappers * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak @@ -23,7 +23,7 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -168,6 +168,9 @@ int CHAMELEON_zplrnk_Tile(int K, CHAM_desc_t *C, unsigned long long int seedA, u int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); +#if defined(PRECISION_z) || defined(PRECISION_d) +int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); +#endif int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotrimm_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C); @@ -245,6 +248,9 @@ int CHAMELEON_zplrnk_Tile_Async(int K, CHAM_desc_t *C, unsigned long long int se int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +#if defined(PRECISION_z) || defined(PRECISION_d) +int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +#endif int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index db562075436d82c7ac06759eb0489f376b59d3bf..794d1fe6137c530875956cc556c93186aa51b474 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -11,7 +11,7 @@ * * @brief Chameleon CHAMELEON_Complex64_t elementary tasks header * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak @@ -24,7 +24,7 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-02-21 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -78,6 +78,9 @@ void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *T, int Tm, int Tn ); +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zgessm( const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 5d8113b9b85792e30f0d0aaaf6c22d7504a1b128..d5f0e12ca07f0ef033612e6d62725327a59bd1ea 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -114,6 +114,10 @@ set(CODELETS_ZSRC # Reduction methods ################## codelets/codelet_zgersum.c + ################## + # Precision modification kernels + ################## + codelets/codelet_zgered.c ) set(CODELETS_SRC diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 30ea76045131884c610a0f4ee430393f732f6360..abb4f79aa6d6eb66f3c99454c76c4e5397d3e385 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -243,6 +243,7 @@ set(ZSRC codelets/codelet_zcallback.c codelets/codelet_zccallback.c codelets/codelet_dlag2h.c + codelets/codelet_zgerst.c ${CODELETS_ZSRC} ) diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..394ef74c24ac9023a1b594b348ba21479df48d3a --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgered.c @@ -0,0 +1,142 @@ +/** + * + * @file starpu/codelet_zgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_starpu.h" +#include <coreblas/lapacke.h> +#include "runtime_codelet_zc.h" +#include "runtime_codelet_z.h" + +//#define CHAMELEON_DEBUG_GERED + +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + CHAM_tile_t *tileA; + double u_low; + int64_t mm, nn; +#if defined(CHAMELEON_USE_MPI) + int tag; +#endif + starpu_data_handle_t *handleAin; + starpu_data_handle_t handleAout; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + /* Get the Input handle */ + mm = Am + (A->i / A->mb); + nn = An + (A->j / A->nb); + handleAin = A->schedopt; + handleAin += ((int64_t)A->lmt) * nn + mm; + + assert( *handleAin != NULL ); + + /* + * Lets convert the tile precision based on the following criteria: + * + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < u_{high} * || A ||_F / nt * 1/ u_{low} + * ||A_{i,j}||_F < threshold / u_{low} + */ + + tileA = A->get_blktile( A, Am, An ); +#if defined(CHAMELEON_USE_MPI) + tag = starpu_mpi_data_get_tag( *handleAin ); +#endif /* defined(CHAMELEON_USE_MPI) */ + +#if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500) +#if defined(PRECISION_d) + if ( options->withcuda ) { + /* + * Check for half precision + */ + u_low = 1.e-4; + if ( Anorm < (threshold / u_low) ) { +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] Convert the tile ( %d, %d ) to half precision\n", + A->myrank, Am, An ); +#endif + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf ); + + rt_starpu_insert_task( + &cl_dlag2h, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2h", +#endif + 0); + + starpu_data_unregister_submit( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexHalf; +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( handleAout, tag, tileA->rank ); +#endif + return; + } + } +#endif +#endif + + /* + * Check for single precision + */ +#if !defined(CHAMELEON_SIMULATION) + u_low = LAPACKE_slamch_work('e'); +#else + u_low = 1e-8; +#endif + if ( Anorm < (threshold / u_low) ) { +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] Convert the tile ( %d, %d ) to single precision\n", + A->myrank, Am, An ); +#endif + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); + + rt_starpu_insert_task( + &cl_zlag2c, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zlag2c", +#endif + 0); + + starpu_data_unregister_submit( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexFloat; +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( *handleAin, tag, tileA->rank ); +#endif + return; + } +} diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index f3f89af41aab00bc870fa663f2d8aff4e0dea39b..db7d35daca01a8b1b7fc16abf17fb742dda01e65 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -90,6 +90,10 @@ CODELETS_HEADER(zunmqr); * Auxiliary functions */ CODELETS_HEADER(zgeadd); +#if defined(PRECISION_z) || defined(PRECISION_d) +CODELETS_HEADER(zgered); +#endif +CODELETS_HEADER(zhessq); CODELETS_HEADER(zhe2ge); CODELETS_HEADER(zlascal); CODELETS_HEADER(ztradd);