diff --git a/.gitignore b/.gitignore index 8f6bc547daa6b82071fe009078707683d3c83aa4..a6fc53760369da3fe6ead5acf96181bb418e850b 100644 --- a/.gitignore +++ b/.gitignore @@ -464,6 +464,7 @@ MigrationBackup/ FodyWeavers.xsd # VS Code files for those working on multiple tools +.vscode .vscode/* !.vscode/settings.json !.vscode/tasks.json diff --git a/ChangeLog b/ChangeLog index 237f203273ceef734df52a947d35ff7cd52f8a33..e615c618376b5283cd311809fd3c5721e7d994ea 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,5 +1,8 @@ chameleon-1.3.0 ------------------------------------------------------------------------ + - mixed-precision: introduce descripto with precision adapted to local norms + - Add CHAMELEON_[dz]gered... functions to reduce the precision of the tiles based on a requested accuracy + - Add CHAMELEON_[dz]gerst... functions to restore the original numerical precision of the tiles in a descriptor - types: add support for half precision arithmetic into the data descriptors - cuda: add half precision conversion kernels, and variants of the gemm kernels (hgemm, and gemmex) - descriptors: Add the possibility to pass arguments to the rankof diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index df41dd109b68835ae2f07759d0bb81b758bfe075..a833ae81636299440af4f81b27efe9260497d4a9 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -11,7 +11,7 @@ @author Florent Pruvost @author Nathalie Furmento @author Alycia Lisito - @date 2023-07-04 + @date 2023-07-06 """ _extra_blas = [ @@ -39,6 +39,7 @@ _extra_blas = [ ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), + ('', 'slag2d', 'slag2d', 'clag2z', 'clag2z' ), ('', 'slag2h', 'dlag2h', 'slag2h', 'dlag2h' ), ('', 'hlag2s', 'hlag2d', 'hlag2s', 'hlag2d' ), ('', 'slag2h', 'dlag2h', 'clag2x', 'zlag2x' ), @@ -48,6 +49,8 @@ _extra_blas = [ ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), ('', 'sgersum', 'dgersum', 'cgersum', 'zgersum' ), ('', 'sprint', 'dprint', 'cprint', 'zprint' ), + ('', 'sgered', 'dgered', 'cgered', 'zgered' ), + ('', 'sgerst', 'dgerst', 'cgerst', 'zgerst' ), ] _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ] @@ -114,6 +117,7 @@ subs = { ('CHAMELEON_p', 'CHAMELEON_s', 'CHAMELEON_d', 'CHAMELEON_c', 'CHAMELEON_z' ), ('RUNTIME_P', 'RUNTIME_s', 'RUNTIME_d', 'RUNTIME_c', 'RUNTIME_z' ), ('chameleon_p', 'chameleon_s', 'chameleon_d', 'chameleon_c', 'chameleon_z' ), + ('codelet_p', 'codelet_ds', 'codelet_ds', 'codelet_zc', r'codelet_zc\b' ), ('codelet_p', 'codelet_s', 'codelet_d', 'codelet_c', 'codelet_z' ), ('runtime_p', 'runtime_s', 'runtime_d', 'runtime_c', 'runtime_z' ), ('testing_p', 'testing_s', 'testing_d', 'testing_c', 'testing_z' ), diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 14eec76587d3ae4bc865e23dd2725625bc0694e9..1c7b5a0b2e623d1091819c72004998b1cd673ac1 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -27,7 +27,7 @@ # @author Alycia Lisito # @author Loris Lucido # @author Matthieu Kuhn -# @date 2023-01-30 +# @date 2023-07-06 # ### @@ -192,7 +192,11 @@ set(ZSRC # MIXED PRECISION ################## pzlag2c.c + pzgered.c + pzgerst.c ### + zgered.c + zgerst.c #zcgels.c #zcgesv.c #zcposv.c diff --git a/compute/pzgered.c b/compute/pzgered.c new file mode 100644 index 0000000000000000000000000000000000000000..f934c494df6a576e4d2fe52bc2a2091ff9b4827a --- /dev/null +++ b/compute/pzgered.c @@ -0,0 +1,260 @@ +/** + * + * @file pzgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zlange parallel algorithm + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> z d + * + */ +//ALLOC_WS : A->mb +//ALLOC_WS : A->nb +//WS_ADD : A->mb + A->nb +#include "control/common.h" +#include <coreblas/lapacke.h> + +#define A( m, n ) A, (m), (n) +#define W( desc, m, n ) (desc), (m), (n) + +static inline void +chameleon_pzgered_frb( cham_uplo_t uplo, + CHAM_desc_t *A, CHAM_desc_t *Wnorm, CHAM_desc_t *Welt, + RUNTIME_option_t *options ) +{ + double alpha = 1.0; + double beta = 0.0; + + int m, n; + int minMNT = chameleon_min( A->mt, A->nt ); + int minMN = chameleon_min( A->m, A->n ); + int MT = (uplo == ChamUpper) ? minMNT : A->mt; + int NT = (uplo == ChamLower) ? minMNT : A->nt; + int M = (uplo == ChamUpper) ? minMN : A->m; + int N = (uplo == ChamLower) ? minMN : A->n; + int P = Welt->p; + int Q = Welt->q; + + /* Initialize workspaces for tile norms */ + for(m = 0; m < Wnorm->mt; m++) { + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; + + for(n = nmin; n < nmax; n++) { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Wnorm->mb, Wnorm->nb, + alpha, beta, + W( Wnorm, m, n ) ); + } + } + + /* Initialize workspaces */ + for(m = 0; m < Welt->mt; m++) { + for(n = 0; n < Welt->nt; n++) { + INSERT_TASK_dlaset( + options, + ChamUpperLower, Welt->mb, Welt->nb, + alpha, beta, + W( Welt, m, n ) ); + } + } + + /** + * Step 1: + * For j in [1,Q], Welt(m, j) = reduce( A(m, j+k*Q) ) + */ + for(m = 0; m < MT; m++) { + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, NT) : NT; + + int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb; + + for(n = nmin; n < nmax; n++) { + int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb; + + if ( (n == m) && (uplo != ChamUpperLower) ) { + INSERT_TASK_ztrssq( + options, + uplo, ChamNonUnit, tempmm, tempnn, + A(m, n), W( Wnorm, m, n) ); + } + else { + INSERT_TASK_zgessq( + options, + ChamEltwise, + tempmm, tempnn, + A(m, n), W( Wnorm, m, n) ); + } + + /* Compress the info per line */ + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Wnorm, m, n), W( Welt, m, n%Q) ); + + /* Compute the final norm of the tile */ + INSERT_TASK_dplssq2( + options, 1, W( Wnorm, m, n ) ); + } + + /** + * Step 2: + * For each j, W(m, j) = reduce( Welt(m, 0..Q-1) ) + */ + for(n = 1; n < Q; n++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) ); + } + } + + /** + * Step 3: + * For m in 0..P-1, Welt(m, n) = max( Welt(m..mt[P], n ) ) + */ + for(m = P; m < MT; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) ); + } + + /** + * Step 4: + * For each i, Welt(i, n) = max( Welt(0..P-1, n) ) + */ + for(m = 1; m < P; m++) { + INSERT_TASK_dplssq( + options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) ); + } + + INSERT_TASK_dplssq2( + options, 1, W( Welt, 0, 0) ); + + /** + * Broadcast the result + */ + for(m = 0; m < A->p; m++) { + for(n = 0; n < A->q; n++) { + if ( (m != 0) || (n != 0) ) { + INSERT_TASK_dlacpy( + options, + ChamUpperLower, 1, 1, + W( Welt, 0, 0 ), W( Welt, m, n ) ); + } + } + } +} + +/** + * + */ +void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + CHAM_desc_t Wcol; + CHAM_desc_t Welt; + double gnorm, lnorm, threshold, eps; + + int workmt, worknt; + int m, n; + + chamctxt = chameleon_context_self(); + if ( sequence->status != CHAMELEON_SUCCESS ) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + workmt = chameleon_max( A->mt, A->p ); + worknt = chameleon_max( A->nt, A->q ); + + RUNTIME_options_ws_alloc( &options, 1, 0 ); + + /* Matrix to store the norm of each element */ + chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + A->mt * 2, A->nt, 0, 0, A->mt * 2, A->nt, A->p, A->q, + NULL, NULL, A->get_rankof_init, A->get_rankof_init_arg ); + + /* Matrix to compute the global frobenius norm */ + chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_GLOBAL, ChamRealDouble, 2, 1, 2, + workmt*2, worknt, 0, 0, workmt*2, worknt, A->p, A->q, + NULL, NULL, NULL, NULL ); + + chameleon_pzgered_frb( uplo, A, &Wcol, &Welt, &options ); + + CHAMELEON_Desc_Flush( &Wcol, sequence ); + CHAMELEON_Desc_Flush( &Welt, sequence ); + CHAMELEON_Desc_Flush( A, sequence ); + + RUNTIME_sequence_wait( chamctxt, sequence ); + + gnorm = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q )); + chameleon_desc_destroy( &Welt ); + + /** + * Reduce the precision of the tiles if possible + */ + if ( prec < 0. ) { +#if !defined(CHAMELEON_SIMULATION) + double eps = LAPACKE_dlamch_work('e'); +#else +#if defined(PRECISION_z) || defined(PRECISION_d) + double eps = 1.e-15; +#else + double eps = 1.e-7; +#endif +#endif + } + else { + eps = prec; + } + threshold = (eps * gnorm) / (double)(chameleon_min(A->mt, A->nt)); + +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] The norm of A is: %e\n" + "[%2d] The requested precision is: %e\n" + "[%2d] The computed threshold is: %e\n", + A->myrank, gnorm, + A->myrank, eps, + A->myrank, threshold ); +#endif + for(m = 0; m < A->mt; m++) { + int tempmm = ( m == (A->mt-1) ) ? A->m - m * A->mb : A->mb; + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, A->nt) : A->nt; + + for(n = nmin; n < nmax; n++) { + CHAM_tile_t *tile = A->get_blktile( A, m, n ); + if ( tile->rank == A->myrank ) { + int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb; + + /* Get the frobenius norm of the tile A( m, n ) */ + lnorm = ((double*)((Wcol.get_blktile( &Wcol, m, n ))->mat))[0]; + + /* + * u_{high} = 1e-16 (later should be application accuraccy) + * u_{low} = 1e-8 + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < threshold / u_{low} + */ + INSERT_TASK_zgered( &options, threshold, lnorm, + tempmm, tempnn, A( m, n ) ); + } + } + } + + CHAMELEON_Desc_Flush( A, sequence ); + RUNTIME_sequence_wait( chamctxt, sequence ); + + chameleon_desc_destroy( &Wcol ); + RUNTIME_options_ws_free(&options); + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/pzgerst.c b/compute/pzgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..86d01e1688b668878d5d1a432b53705df4d2f84a --- /dev/null +++ b/compute/pzgerst.c @@ -0,0 +1,58 @@ +/** + * + * @file pzgerst.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgerst parallel algorithm + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "control/common.h" + +#define A(m,n) A, m, n +#define B(m,n) B, m, n + +void chameleon_pzgerst( cham_uplo_t uplo, + CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, + RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + int m, n; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init(&options, chamctxt, sequence, request); + + for(m = 0; m < A->mt; m++) { + int tempmm = ( m == (A->mt-1) ) ? A->m - m * A->mb : A->mb; + int nmin = ( uplo == ChamUpper ) ? m : 0; + int nmax = ( uplo == ChamLower ) ? chameleon_min(m+1, A->nt) : A->nt; + + for(n = nmin; n < nmax; n++) { + CHAM_tile_t *tile = A->get_blktile( A, m, n ); + + if (( tile->rank == A->myrank ) && + ( tile->flttype != ChamComplexDouble ) ) + { + int tempnn = ( n == (A->nt-1) ) ? A->n - n * A->nb : A->nb; + + INSERT_TASK_zgerst( &options, + tempmm, tempnn, A( m, n ) ); + } + } + } + + RUNTIME_options_finalize(&options, chamctxt); +} diff --git a/compute/zgered.c b/compute/zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..7c95e7bd10b68c129545c69034ffecc964abc2ce --- /dev/null +++ b/compute/zgered.c @@ -0,0 +1,172 @@ +/** + * + * @file zgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered wrappers + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> z d + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the synchronous version of CHAMELEON_zgeredinit_Tile_Async(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. + * + ******************************************************************************* + * + * @param[in] uplo + * = ChamUpper: Upper triangle of A is stored; + * = ChamLower: Lower triangle of A is stored. + * + * @param[in] A + * On entry, the symmetric positive definite (or Hermitian) matrix A. + * If uplo = ChamUpper, the leading N-by-N upper triangular part of A + * contains the upper triangular part of the matrix A, and the strictly lower triangular + * part of A is not referenced. + * If UPLO = 'L', the leading N-by-N lower triangular part of A contains the lower + * triangular part of the matrix A, and the strictly upper triangular part of A is not + * referenced. + * On exit, if return value = 0, the factor U or L from the Cholesky factorization + * A = U^H*U or A = L*L^H. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval >0 if i, the leading minor of order i of A is not positive definite, so the + * factorization could not be completed, and the solution has not been computed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zgered + * @sa CHAMELEON_zgered_Tile_Async + * @sa CHAMELEON_cpotrfmp_Tile + * @sa CHAMELEON_dpotrfmp_Tile + * @sa CHAMELEON_spotrfmp_Tile + * @sa CHAMELEON_zpotrs_Tile + * + */ +int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double precision, CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgeredinit_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgered_Tile_Async( uplo, precision, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Computes the Cholesky factorization of a symmetric positive definite + * or Hermitian positive definite matrix with mixed precision. + * + * This is the non-blocking equivalent of CHAMELEON_zgered_Tile(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. It may return before the computation is + * finished. This function allows for pipelining operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgered + * @sa CHAMELEON_zgered_Tile + * @sa CHAMELEON_cpotrfmp_Tile_Async + * @sa CHAMELEON_dpotrfmp_Tile_Async + * @sa CHAMELEON_spotrfmp_Tile_Async + * @sa CHAMELEON_zpotrs_Tile_Async + * + */ +int CHAMELEON_zgered_Tile_Async( cham_uplo_t uplo, double precision, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgered_Tile_Async", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgered_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + /* Check input arguments */ + if (A->nb != A->mb) { + chameleon_error("CHAMELEON_zgered_Tile_Async", "only square tiles supported"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + /* + * Quick return + */ + if ( chameleon_max( A->m, A->n ) == 0 ) { + return CHAMELEON_SUCCESS; + } + + chameleon_pzgered( uplo, precision, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/compute/zgerst.c b/compute/zgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..8d283a6f8a15371f4d9e15079834135bb1180eaa --- /dev/null +++ b/compute/zgerst.c @@ -0,0 +1,148 @@ +/** + * + * @file zgerst.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon auxiliary routines to restore the original precision of a matrix. + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Yuxi Hong + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "control/common.h" + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile + * + * @brief Restore the original precision of a given matrix that may have been + * used in reduced precision during some computations. See + * CHAMELEON_zgered_Tile() to introduce mixed-precision tiles into the matrix. + * + ******************************************************************************* + * + * @param[in] uplo + * Specifies the shape of the matrix A: + * = ChamUpper: A is upper triangular; + * = ChamLower: A is lower triangular; + * = ChamUpperLower: A is general. + * + * @param[in] A + * Descriptor of the CHAMELEON matrix to restore. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * + ******************************************************************************* + * + * @sa CHAMELEON_zgered_Tile + * @sa CHAMELEON_zgered_Tile_Async + * @sa CHAMELEON_zgerst_Tile + * @sa CHAMELEON_cgerst_Tile + * @sa CHAMELEON_dgerst_Tile + * @sa CHAMELEON_sgerst_Tile + * + */ +int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, + CHAM_desc_t *A ) +{ + CHAM_context_t *chamctxt; + RUNTIME_sequence_t *sequence = NULL; + RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + int status; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgerst_Tile", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + chameleon_sequence_create( chamctxt, &sequence ); + + CHAMELEON_zgerst_Tile_Async( uplo, A, sequence, &request ); + + CHAMELEON_Desc_Flush( A, sequence ); + + chameleon_sequence_wait( chamctxt, sequence ); + status = sequence->status; + chameleon_sequence_destroy( chamctxt, sequence ); + return status; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t_Tile_Async + * + * @brief Restore the original precision of a given matrix that may have been + * used in reduced precision during some computations. See + * CHAMELEON_zgered_Tile() to introduce mixed-precision tiles into the matrix. + * + * This is the non-blocking equivalent of CHAMELEON_zgerst_Tile(). It + * operates on matrices stored by tiles with tiles of potentially different + * precisions. All matrices are passed through descriptors. All dimensions are + * taken from the descriptors. It may return before the computation is + * finished. This function allows for pipelining operations at runtime. + * + ******************************************************************************* + * + * @param[in] sequence + * Identifies the sequence of function calls that this call belongs to + * (for completion checks and exception handling purposes). + * + * @param[out] request + * Identifies this function call (for exception handling purposes). + * + ******************************************************************************* + * + * @sa CHAMELEON_zgerst_Tile + * @sa CHAMELEON_zgered_Tile + * @sa CHAMELEON_zgered_Tile_Async + * + */ +int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, + CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, + RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + + chamctxt = chameleon_context_self(); + if (chamctxt == NULL) { + chameleon_fatal_error("CHAMELEON_zgerst_Tile_Async", "CHAMELEON not initialized"); + return CHAMELEON_ERR_NOT_INITIALIZED; + } + if (sequence == NULL) { + chameleon_fatal_error("CHAMELEON_zgerst_Tile_Async", "NULL sequence"); + return CHAMELEON_ERR_UNALLOCATED; + } + if (request == NULL) { + chameleon_fatal_error("CHAMELEON_zgerst_Tile_Async", "NULL request"); + return CHAMELEON_ERR_UNALLOCATED; + } + /* Check sequence status */ + if (sequence->status == CHAMELEON_SUCCESS) { + request->status = CHAMELEON_SUCCESS; + } + else { + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED); + } + + /* Check descriptors for correctness */ + if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) { + chameleon_error("CHAMELEON_zgerst_Tile_Async", "invalid descriptor"); + return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); + } + + chameleon_pzgerst( uplo, A, sequence, request ); + + return CHAMELEON_SUCCESS; +} diff --git a/control/compute_z.h b/control/compute_z.h index 8bec9da5fa4bab356cde2bdb61b7717ffd086fde..06eae17b508c012918bbd011bad9cbb25a7bb7d4 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -22,7 +22,7 @@ * @author Alycia Lisito * @author Matthieu Kuhn * @author Lionel Eyraud-Dubois - * @date 2023-07-05 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -76,6 +76,12 @@ int chameleon_zshift(CHAM_context_t *chamctxt, int m, int n, CHAMELEON_Complex64 /** * Declarations of parallel functions (dynamic scheduling) - alphabetical order */ +#if defined(PRECISION_z) || defined(PRECISION_d) +void chameleon_pzgered( cham_uplo_t uplo, double prec, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +void chameleon_pzgerst( cham_uplo_t uplo, CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +#endif int chameleon_pzgebrd( int genD, cham_job_t jobu, cham_job_t jobvt, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D, CHAMELEON_Complex64_t *U, int LDU, CHAMELEON_Complex64_t *VT, int LDVT, diff --git a/control/descriptor.c b/control/descriptor.c index 1436ad25d7c195bfa173b8f5ed2e70cf4c3ad2a2..e52f276366038554761d3a9fe2f7616789addc1f 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -19,7 +19,7 @@ * @author Raphael Boucherie * @author Samuel Thibault * @author Lionel Eyraud-Dubois - * @date 2023-07-05 + * @date 2023-07-06 * *** * @@ -500,12 +500,6 @@ int CHAMELEON_Desc_Create( CHAM_desc_t **descptr, void *mat, cham_flttype_t dtyp { blkrankof_fct_t get_rankof = NULL; - /* if (getenv("CHAMELEON_1Dx1D_DISTRIBUTION")){ */ - /* printf("[CHAMELEON] : Using 1Dx1D distribubtion\n"); */ - /* get_rankof = chameleon_getrankof_custom; */ - /* load_dist(&custom_dist_a, m, n, mb, nb); */ - /* } */ - return CHAMELEON_Desc_Create_User( descptr, mat, dtyp, mb, nb, bsiz, lm, ln, i, j, m, n, p, q, NULL, NULL, get_rankof, NULL ); diff --git a/gpucublas/include/CMakeLists.txt b/gpucublas/include/CMakeLists.txt index b208d377e5b85e2549153728182caa53c9ffa2ef..2899bcf14ddcefaeacb1fcf7a48afedad36851cb 100644 --- a/gpucublas/include/CMakeLists.txt +++ b/gpucublas/include/CMakeLists.txt @@ -17,10 +17,10 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Florent Pruvost # @author Mathieu Faverge -# @date 2022-02-22 +# @date 2023-07-06 # ### @@ -29,6 +29,7 @@ set(GPUCUBLAS_HDRS_GENERATED "") set(ZHDR gpucublas/gpucublas_z.h + gpucublas/gpucublas_zc.h ) precisions_rules_py( GPUCUBLAS_HDRS_GENERATED "${ZHDR}" diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h index 2305b4f9be07ee8f06d2ce6154fd1d55c0ece938..e44a7e094278010192a979e2b831f57a5ee5a849 100644 --- a/gpucublas/include/gpucublas.h +++ b/gpucublas/include/gpucublas.h @@ -61,6 +61,8 @@ BEGIN_C_DECLS #include "gpucublas/gpucublas_d.h" #include "gpucublas/gpucublas_c.h" #include "gpucublas/gpucublas_s.h" +#include "gpucublas/gpucublas_zc.h" +#include "gpucublas/gpucublas_ds.h" int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h index 0773e003598a9a72597355dc71d5f785ec48af70..cf655f41fa47511b06a4c60d11e6397a8b6d9514 100644 --- a/gpucublas/include/gpucublas/gpucublas_z.h +++ b/gpucublas/include/gpucublas/gpucublas_z.h @@ -11,10 +11,10 @@ * * @brief Chameleon GPU CHAMELEON_Complex64_t kernels header * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -24,6 +24,8 @@ /** * Declarations of cuda kernels - alphabetical order */ +int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle ); +int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle ); int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); diff --git a/gpucublas/include/gpucublas/gpucublas_zc.h b/gpucublas/include/gpucublas/gpucublas_zc.h new file mode 100644 index 0000000000000000000000000000000000000000..48f7073da82b211b806d9ccb83961900b7fb6535 --- /dev/null +++ b/gpucublas/include/gpucublas/gpucublas_zc.h @@ -0,0 +1,24 @@ +/** + * + * @file gpucublas_zc.h + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon GPU Mixed-precision kernels header + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions mixed zc -> zc ds + * + */ +#ifndef _gpucublas_zc_h_ +#define _gpucublas_zc_h_ + +int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle ); + +#endif /* _gpucublas_zc_h_ */ diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index fa5af0a740a3af86a8ce25c0e9cc74f72eaf4296..fa5f069e6057cb0f79eb47a7f0e2bbc38777a14b 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -11,7 +11,7 @@ * * @brief Chameleon CHAMELEON_complex64_t wrappers * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak @@ -23,7 +23,7 @@ * @author Florent Pruvost * @author Alycia Lisito * @author Matthieu Kuhn - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -168,6 +168,10 @@ int CHAMELEON_zplrnk_Tile(int K, CHAM_desc_t *C, unsigned long long int seedA, u int CHAMELEON_zpoinv_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zposv_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B); int CHAMELEON_zpotrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); +#if defined(PRECISION_z) || defined(PRECISION_d) +int CHAMELEON_zgered_Tile( cham_uplo_t uplo, double prec, CHAM_desc_t *A ); +int CHAMELEON_zgerst_Tile( cham_uplo_t uplo, CHAM_desc_t *A ); +#endif int CHAMELEON_zsytrf_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotri_Tile(cham_uplo_t uplo, CHAM_desc_t *A); int CHAMELEON_zpotrimm_Tile(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C); @@ -245,6 +249,10 @@ int CHAMELEON_zplrnk_Tile_Async(int K, CHAM_desc_t *C, unsigned long long int se int CHAMELEON_zpoinv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zposv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +#if defined(PRECISION_z) || defined(PRECISION_d) +int CHAMELEON_zgered_Tile_Async(cham_uplo_t uplo, double prec, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zgerst_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); +#endif int CHAMELEON_zsytrf_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotri_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in index 34b3f3febe2558bd3eca466e7b9fcdd21d8dbeb1..3b5ceb7fa3e00d5571aca777ac1854d2a33a28c3 100644 --- a/include/chameleon/config.h.in +++ b/include/chameleon/config.h.in @@ -57,6 +57,14 @@ #cmakedefine CHAMELEON_USE_CUBLAS #cmakedefine CHAMELEON_USE_HIP +/* Arithmetics support */ +#cmakedefine CHAMELEON_PREC_S +#cmakedefine CHAMELEON_PREC_D +#cmakedefine CHAMELEON_PREC_C +#cmakedefine CHAMELEON_PREC_Z +#cmakedefine CHAMELEON_PREC_DS +#cmakedefine CHAMELEON_PREC_ZC + /* Hmat-oss */ #cmakedefine CHAMELEON_USE_HMAT diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h index b7281f13566b0c9a4ed274d7a9be762a5c0f3cc9..bc7a59e6f0b36ae28602218d6579c1f47ddbf142 100644 --- a/include/chameleon/tasks.h +++ b/include/chameleon/tasks.h @@ -11,11 +11,11 @@ * * @brief Chameleon elementary tasks main header * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * */ #ifndef _chameleon_tasks_h_ @@ -100,6 +100,27 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An, cham_unary_operator_t op_fct, void *op_args ); +void INSERT_TASK_gemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ); + +void INSERT_TASK_gemmex( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ); + +void INSERT_TASK_hgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ); + #include "chameleon/tasks_z.h" #include "chameleon/tasks_d.h" #include "chameleon/tasks_c.h" diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index db562075436d82c7ac06759eb0489f376b59d3bf..b58895aa4346b597fbd7dece2606df7a6de35fd3 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -11,7 +11,7 @@ * * @brief Chameleon CHAMELEON_Complex64_t elementary tasks header * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Jakub Kurzak @@ -24,7 +24,7 @@ * @author Alycia Lisito * @author Romain Peressoni * @author Matthieu Kuhn - * @date 2023-02-21 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -78,6 +78,12 @@ void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options, int m, int n, int ib, int nb, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *T, int Tm, int Tn ); +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ); +void INSERT_TASK_zgerst( const RUNTIME_option_t *options, + int m, int n, + const CHAM_desc_t *A, int Am, int An ); void INSERT_TASK_zgessm( const RUNTIME_option_t *options, int m, int n, int k, int ib, int nb, int *IPIV, diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt index 5d8113b9b85792e30f0d0aaaf6c22d7504a1b128..fc1aac3355eb6fda653647270b5103280117c0d9 100644 --- a/runtime/CMakeLists.txt +++ b/runtime/CMakeLists.txt @@ -114,6 +114,11 @@ set(CODELETS_ZSRC # Reduction methods ################## codelets/codelet_zgersum.c + ################## + # Precision modification kernels + ################## + codelets/codelet_zgered.c + codelets/codelet_zgerst.c ) set(CODELETS_SRC diff --git a/runtime/openmp/codelets/codelet_zgered.c b/runtime/openmp/codelets/codelet_zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..4ccc67ba8741495d294ba45c66d4db111118e5bb --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgered.c @@ -0,0 +1,34 @@ +/** + * + * @file openmp/codelet_zgered.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered OpenMP codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_openmp.h" + +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gered kernel is not available with OpenMP\n" ); + + (void)options; + (void)threshold; + (void)Anorm; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/openmp/codelets/codelet_zgerst.c b/runtime/openmp/codelets/codelet_zgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..97a9d65718d192c9be9b73b35b4c69ae19aae71c --- /dev/null +++ b/runtime/openmp/codelets/codelet_zgerst.c @@ -0,0 +1,32 @@ +/** + * + * @file openmp/codelet_zgerst.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgerst OpenMP codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_openmp.h" + +void INSERT_TASK_zgerst( const RUNTIME_option_t *options, + int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gerst kernel is not available with OpenMP\n" ); + + (void)options; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/openmp/codelets/codelet_zgersum.c b/runtime/openmp/codelets/codelet_zgersum.c index 3ce42080a47a1122d83fe35aa921592ae275f54b..83c479a3983932742b5a96d231c661c1daa502f9 100644 --- a/runtime/openmp/codelets/codelet_zgersum.c +++ b/runtime/openmp/codelets/codelet_zgersum.c @@ -1,6 +1,6 @@ /** * - * @file starpu/codelet_zgersum.c + * @file openmp/codelet_zgersum.c * * @copyright 2009-2014 The University of Tennessee and The University of * Tennessee Research Foundation. All rights reserved. diff --git a/runtime/openmp/control/runtime_async.c b/runtime/openmp/control/runtime_async.c index 64cccaa4b8ebade256a98b3e294b602ed7849520..764612d4b8c55ca0fd1cfbb8ff7f65fbb7e64815 100644 --- a/runtime/openmp/control/runtime_async.c +++ b/runtime/openmp/control/runtime_async.c @@ -9,7 +9,7 @@ * *** * - * @brief Chameleon StarPU asynchronous routines + * @brief Chameleon OpenMP asynchronous routines * * @version 1.2.0 * @author Mathieu Faverge diff --git a/runtime/openmp/control/runtime_context.c b/runtime/openmp/control/runtime_context.c index 05743743c3082330d891ed491d74ec7045f0449f..5a1a1f917a059ec3d327ef4a13e6e0734922d043 100644 --- a/runtime/openmp/control/runtime_context.c +++ b/runtime/openmp/control/runtime_context.c @@ -9,7 +9,7 @@ * *** * - * @brief Chameleon StarPU context routines + * @brief Chameleon OpenMP context routines * * @version 1.2.0 * @author Cedric Augonnet diff --git a/runtime/openmp/control/runtime_control.c b/runtime/openmp/control/runtime_control.c index 08830268f1f4677951bf1d0291265ce63282bd5a..8aa704f33fbbc5ae7a7a25145415cc55455ac221 100644 --- a/runtime/openmp/control/runtime_control.c +++ b/runtime/openmp/control/runtime_control.c @@ -9,7 +9,7 @@ * *** * - * @brief Chameleon StarPU control routines + * @brief Chameleon OpenMP control routines * * @version 1.2.0 * @author Mathieu Faverge diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c index 3c10e5801060670ea28b31631ef270cc9028516d..3f52b69331786bc9ddfd59979c641e3b2b9002a1 100644 --- a/runtime/openmp/control/runtime_options.c +++ b/runtime/openmp/control/runtime_options.c @@ -9,7 +9,7 @@ * *** * - * @brief Chameleon StarPU options routines + * @brief Chameleon OpenMP options routines * * @version 1.2.0 * @author Cedric Augonnet diff --git a/runtime/openmp/control/runtime_profiling.c b/runtime/openmp/control/runtime_profiling.c index e1c15de64fa7bdb6845ccaa2984f2fe5ee43a204..f632fbe9cb7b00a79f95345169bb55a9becec8ee 100644 --- a/runtime/openmp/control/runtime_profiling.c +++ b/runtime/openmp/control/runtime_profiling.c @@ -9,7 +9,7 @@ * *** * - * @brief Chameleon StarPU profiling routines + * @brief Chameleon OpenMP profiling routines * * @version 1.2.0 * @author Cedric Augonnet diff --git a/runtime/parsec/codelets/codelet_zgered.c b/runtime/parsec/codelets/codelet_zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..b4fc056018fa7e8a7fe623a60dc409a15a245251 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgered.c @@ -0,0 +1,34 @@ +/** + * + * @file parsec/codelet_zgered.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered PaRSEC codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_parsec.h" + +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gered kernel is not available with PaRSEC\n" ); + + (void)options; + (void)threshold; + (void)Anorm; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/parsec/codelets/codelet_zgerst.c b/runtime/parsec/codelets/codelet_zgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..134dc20c7cb88eec4215639c17f186e1d5eb80c4 --- /dev/null +++ b/runtime/parsec/codelets/codelet_zgerst.c @@ -0,0 +1,32 @@ +/** + * + * @file parsec/codelet_zgerst.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgerst PaRSEC codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_parsec.h" + +void INSERT_TASK_zgerst( const RUNTIME_option_t *options, + int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gerst kernel is not available with PaRSEC\n" ); + + (void)options; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/parsec/codelets/codelet_zgersum.c b/runtime/parsec/codelets/codelet_zgersum.c index 4ab4c6208da83bf9d9923c73faaddc3368c05dd9..0d68ff0601d051b7534038346964c12e334cce39 100644 --- a/runtime/parsec/codelets/codelet_zgersum.c +++ b/runtime/parsec/codelets/codelet_zgersum.c @@ -1,6 +1,6 @@ /** * - * @file starpu/codelet_zgersum.c + * @file parsec/codelet_zgersum.c * * @copyright 2009-2014 The University of Tennessee and The University of * Tennessee Research Foundation. All rights reserved. diff --git a/runtime/quark/codelets/codelet_zgered.c b/runtime/quark/codelets/codelet_zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..d1574eaee84b6178474ffe1dca36d57119ae6628 --- /dev/null +++ b/runtime/quark/codelets/codelet_zgered.c @@ -0,0 +1,34 @@ +/** + * + * @file quark/codelet_zgered.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered Quark codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_quark.h" + +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gered kernel is not available with Quark\n" ); + + (void)options; + (void)threshold; + (void)Anorm; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/quark/codelets/codelet_zgerst.c b/runtime/quark/codelets/codelet_zgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..f717011dfca2091bb14e6a903902dff04bbed940 --- /dev/null +++ b/runtime/quark/codelets/codelet_zgerst.c @@ -0,0 +1,32 @@ +/** + * + * @file quark/codelet_zgerst.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgerst Quark codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_quark.h" + +void INSERT_TASK_zgerst( const RUNTIME_option_t *options, + int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + fprintf( stderr, "WARNING: gerst kernel is not available with Quark\n" ); + + (void)options; + (void)m; + (void)n; + (void)A; + (void)Am; + (void)An; +} diff --git a/runtime/quark/codelets/codelet_zgersum.c b/runtime/quark/codelets/codelet_zgersum.c index 60534e23c920d43ba2c4f9493206c53239309914..0d2525112406a72de037ac859fb333c5a97a4fd1 100644 --- a/runtime/quark/codelets/codelet_zgersum.c +++ b/runtime/quark/codelets/codelet_zgersum.c @@ -1,6 +1,6 @@ /** * - * @file starpu/codelet_zgersum.c + * @file quark/codelet_zgersum.c * * @copyright 2009-2014 The University of Tennessee and The University of * Tennessee Research Foundation. All rights reserved. diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index f7c00783b3e6ec08c10edf41ef9530857810c4de..30ea76045131884c610a0f4ee430393f732f6360 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -26,7 +26,7 @@ # @author Matthieu Kuhn # @author Loris Lucido # @author Terry Cojean -# @date 2023-01-30 +# @date 2023-07-06 # ### cmake_minimum_required(VERSION 3.1) @@ -242,6 +242,7 @@ set(RUNTIME_SRCS_GENERATED "") set(ZSRC codelets/codelet_zcallback.c codelets/codelet_zccallback.c + codelets/codelet_dlag2h.c ${CODELETS_ZSRC} ) @@ -249,6 +250,13 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" PRECISIONS "${CHAMELEON_PRECISION}" TARGETDIR "codelets") +set(CODELETS_SRC + codelets/codelet_convert.c + codelets/codelet_hgemm.c + codelets/codelet_gemm.c + ${CODELETS_SRC} + ) + set(RUNTIME_SRCS ${RUNTIME_COMMON} ${RUNTIME_SRCS_GENERATED} diff --git a/runtime/starpu/codelets/codelet_convert.c b/runtime/starpu/codelets/codelet_convert.c new file mode 100644 index 0000000000000000000000000000000000000000..0b07c069a1d509dc2fbbc09bd97f611154aa8ee0 --- /dev/null +++ b/runtime/starpu/codelets/codelet_convert.c @@ -0,0 +1,133 @@ +/** + * + * @file starpu/codelet_convert.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgemm StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" +#define PRECISION_z +#include "runtime_codelet_z.h" +#undef PRECISION_z +#define PRECISION_d +#include "runtime_codelet_d.h" +#undef PRECISION_d +#define PRECISION_c +#include "runtime_codelet_c.h" +#undef PRECISION_c +#define PRECISION_s +#include "runtime_codelet_s.h" +#undef PRECISION_s +#define PRECISION_zc +#include "runtime_codelet_zc.h" +#undef PRECISION_zc +#define PRECISION_ds +#include "runtime_codelet_ds.h" +#undef PRECISION_ds + +void +insert_task_convert( const RUNTIME_option_t *options, + int m, int n, + cham_flttype_t fromtype, + starpu_data_handle_t fromtile, + cham_flttype_t totype, + starpu_data_handle_t totile ) +{ + struct starpu_codelet *codelet = NULL; + void (*callback)(void*) = NULL; + + int conversion = ChamConvert( fromtype, totype ); + + switch( conversion ) { +#if defined(CHAMELEON_PREC_ZC) + case ChamConvertComplexDoubleToSingle: + codelet = &cl_zlag2c; + callback = cl_zlag2c_callback; + break; + + case ChamConvertComplexSingleToDouble: + codelet = &cl_clag2z; + callback = cl_clag2z_callback; + break; +#endif + +#if defined(CHAMELEON_PREC_DS) + case ChamConvertRealDoubleToSingle: + codelet = &cl_dlag2s; + callback = cl_dlag2s_callback; + break; + + case ChamConvertRealSingleToDouble: + codelet = &cl_slag2d; + callback = cl_slag2d_callback; + break; +#endif + +#if defined(CHAMELEON_PREC_D) && defined(CHAMELON_USE_CUDA) + case ChamConvertRealDoubleToHalf: + codelet = &cl_dlag2h; + callback = cl_dlag2h_callback; + break; + + case ChamConvertRealHalfToDouble: + codelet = &cl_hlag2d; + callback = cl_hlag2d_callback; + break; +#endif + +#if defined(CHAMELEON_PREC_S) && defined(CHAMELON_USE_CUDA) + case ChamConvertRealSingleToHalf: + codelet = &cl_slag2h; + callback = cl_slag2h_callback; + break; + + case ChamConvertRealHalfToSingle: + codelet = &cl_hlag2s; + callback = cl_hlag2s_callback; + break; +#endif + + case ChamConvertComplexDoubleToDouble: + return; + case ChamConvertComplexSingleToSingle: + return; + case ChamConvertComplexHalfToHalf: + return; + case ChamConvertRealDoubleToDouble: + return; + case ChamConvertRealSingleToSingle: + return; + case ChamConvertRealHalfToHalf: + return; + + default: + assert(0); + fprintf( stderr, "INSERT_TASK_convert: Unknown conversion type\n" ); + return; + } + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, fromtile, + STARPU_W, totile, + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, options->profiling ? callback : NULL, + STARPU_EXECUTE_ON_WORKER, options->workerid, + 0); + + return; +} diff --git a/runtime/starpu/codelets/codelet_dlag2h.c b/runtime/starpu/codelets/codelet_dlag2h.c new file mode 100644 index 0000000000000000000000000000000000000000..b910559db38c20a147e23173c884ea5364d923e1 --- /dev/null +++ b/runtime/starpu/codelets/codelet_dlag2h.c @@ -0,0 +1,161 @@ +/** + * + * @file starpu/codelet_dlag2h.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2h StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @author Lucas Barros de Assis + * @author Florent Pruvost + * @author Samuel Thibault + * @date 2023-07-06 + * @precisions normal d -> d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_d.h" + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_dlag2h_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_dlag2h( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_dlag2h failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( dlag2h, NULL, cl_dlag2h_cuda_func, STARPU_CUDA_ASYNC ) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +void INSERT_TASK_dlag2h( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_dlag2h; + void (*callback)(void*) = options->profiling ? cl_dlag2h_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamRealDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2h", +#endif + 0); +} + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_hlag2d_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_hlag2d( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_hlag2d failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( hlag2d, NULL, cl_hlag2d_cuda_func, STARPU_CUDA_ASYNC ) + +void INSERT_TASK_hlag2d( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_hlag2d; + void (*callback)(void*) = options->profiling ? cl_hlag2d_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamComplexFloat, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "hlag2d", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_dlag2z.c b/runtime/starpu/codelets/codelet_dlag2z.c index 19516c88cdff7174efdd18f64fe9d9f3125ad581..34e0b16c0e29207f514110cb9333876e40524319 100644 --- a/runtime/starpu/codelets/codelet_dlag2z.c +++ b/runtime/starpu/codelets/codelet_dlag2z.c @@ -11,9 +11,9 @@ * * @brief Chameleon dlag2z StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c * */ @@ -65,8 +65,8 @@ void INSERT_TASK_dlag2z( const RUNTIME_option_t *options, STARPU_VALUE, &uplo, sizeof(uplo), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, double, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamRealDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_dzasum.c b/runtime/starpu/codelets/codelet_dzasum.c index a085f919427e9cc470a9ff9eb3758bb2ebfc3d60..1fc2cf3090bf046b8ffdda4dd021619ce2d08090 100644 --- a/runtime/starpu/codelets/codelet_dzasum.c +++ b/runtime/starpu/codelets/codelet_dzasum.c @@ -11,14 +11,14 @@ * * @brief Chameleon dzasum StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -67,8 +67,8 @@ void INSERT_TASK_dzasum( const RUNTIME_option_t *options, STARPU_VALUE, &uplo, sizeof(cham_uplo_t), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, double, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_gemm.c b/runtime/starpu/codelets/codelet_gemm.c new file mode 100644 index 0000000000000000000000000000000000000000..69cd67a294d2fcdf116e28326072702527a8d699 --- /dev/null +++ b/runtime/starpu/codelets/codelet_gemm.c @@ -0,0 +1,207 @@ +/** + * + * @file starpu/codelet_gemm.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon gemm StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" +#define PRECISION_z +#include "runtime_codelet_z.h" +#undef PRECISION_z +#define PRECISION_d +#include "runtime_codelet_d.h" +#undef PRECISION_d +#define PRECISION_c +#include "runtime_codelet_c.h" +#undef PRECISION_c +#define PRECISION_s +#include "runtime_codelet_s.h" +#undef PRECISION_s + +void +INSERT_TASK_gemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + struct starpu_codelet *codelet = NULL; + void (*callback)(void*) = NULL; + + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + void *clargs = NULL; + int accessC; + int exec = 0; + size_t argssize = 0; + char *cl_name = "Xgemm"; + CHAM_tile_t *tileC; + cham_flttype_t Cflttype; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + + tileC = C->get_blktile( C, Cm, Cn ); + Cflttype = tileC->flttype; + + switch( Cflttype ) { +#if defined(CHAMELEON_PREC_Z) + case ChamComplexDouble: + codelet = &cl_zgemm; + callback = cl_zgemm_callback; + if ( exec ) { + struct cl_zgemm_args_s *cl_zargs; + cl_zargs = malloc( sizeof( struct cl_zgemm_args_s ) ); + cl_zargs->transA = transA; + cl_zargs->transB = transB; + cl_zargs->m = m; + cl_zargs->n = n; + cl_zargs->k = k; + cl_zargs->alpha = alpha; + cl_zargs->beta = beta; + clargs = (void*)cl_zargs; + argssize = sizeof( struct cl_zgemm_args_s ); + } + break; +#endif +#if defined(CHAMELEON_PREC_C) + case ChamComplexSingle: + codelet = &cl_cgemm; + callback = cl_cgemm_callback; + if ( exec ) { + struct cl_cgemm_args_s *cl_cargs; + cl_cargs = malloc( sizeof( struct cl_cgemm_args_s ) ); + cl_cargs->transA = transA; + cl_cargs->transB = transB; + cl_cargs->m = m; + cl_cargs->n = n; + cl_cargs->k = k; + cl_cargs->alpha = alpha; + cl_cargs->beta = beta; + clargs = (void*)cl_cargs; + argssize = sizeof( struct cl_cgemm_args_s ); + } + break; +#endif +#if defined(CHAMELEON_PREC_D) + case ChamRealDouble: + codelet = &cl_dgemm; + callback = cl_dgemm_callback; + if ( exec ) { + struct cl_dgemm_args_s *cl_dargs; + cl_dargs = malloc( sizeof( struct cl_dgemm_args_s ) ); + cl_dargs->transA = transA; + cl_dargs->transB = transB; + cl_dargs->m = m; + cl_dargs->n = n; + cl_dargs->k = k; + cl_dargs->alpha = alpha; + cl_dargs->beta = beta; + clargs = (void*)cl_dargs; + argssize = sizeof( struct cl_dgemm_args_s ); + } + break; +#endif +#if defined(CHAMELEON_PREC_S) + case ChamRealSingle: + codelet = &cl_sgemm; + callback = cl_sgemm_callback; + if ( exec ) { + struct cl_sgemm_args_s *cl_sargs; + cl_sargs = malloc( sizeof( struct cl_sgemm_args_s ) ); + cl_sargs->transA = transA; + cl_sargs->transB = transB; + cl_sargs->m = m; + cl_sargs->n = n; + cl_sargs->k = k; + cl_sargs->alpha = alpha; + cl_sargs->beta = beta; + clargs = (void*)cl_sargs; + argssize = sizeof( struct cl_sgemm_args_s ); + } + break; +#endif +#if (defined(CHAMELEON_PREC_D) || defined(CHAMELEON_PREC_S)) && defined(CHAMELEON_USE_CUDA) + case ChamRealHalf: + codelet = &cl_hgemm; + callback = cl_hgemm_callback; + if ( exec ) { + struct cl_hgemm_args_s *cl_hargs; + cl_hargs = malloc( sizeof( struct cl_hgemm_args_s ) ); + cl_hargs->transA = transA; + cl_hargs->transB = transB; + cl_hargs->m = m; + cl_hargs->n = n; + cl_hargs->k = k; + cl_hargs->alpha = alpha; + cl_hargs->beta = beta; + clargs = (void*)cl_hargs; + argssize = sizeof( struct cl_hgemm_args_s ); + } + break; +#endif + default: + fprintf( stderr, "INSERT_TASK_gemm: Unknown datatype %d (Mixed=%3s, Type=%d, Size=%d\n", + Cflttype, cham_is_mixed(Cflttype) ? "Yes" : "No", + cham_get_ftype(Cflttype), cham_get_arith(Cflttype) ); + return; + } + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Callback for profiling information */ + callback = options->profiling ? callback : NULL; + + /* Insert the task */ + rt_starpu_insert_task( + codelet, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, argssize, + + /* Task handles */ + STARPU_R, RUNTIME_data_getaddr_withconversion( options, STARPU_R, Cflttype, A, Am, An ), + STARPU_R, RUNTIME_data_getaddr_withconversion( options, STARPU_R, Cflttype, B, Bm, Bn ), + accessC, RUNTIME_data_getaddr_withconversion( options, accessC, Cflttype, C, Cm, Cn ), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); + + return; +} diff --git a/runtime/starpu/codelets/codelet_gemmex.c b/runtime/starpu/codelets/codelet_gemmex.c new file mode 100644 index 0000000000000000000000000000000000000000..efc33dab33d5fef8c8186aebf0c53e401f9ef2ba --- /dev/null +++ b/runtime/starpu/codelets/codelet_gemmex.c @@ -0,0 +1,278 @@ +/** + * + * @file starpu/codelet_gemmex.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon gemmex StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" + +CHAMELEON_CL_CB( gemmex, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ + +struct cl_gemmex_args_s { + cham_trans_t transA; + cham_trans_t transB; + int m; + int n; + int k; + double alpha; + double beta; +}; + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_gemmex_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_gemmex_args_s *clargs = (struct cl_gemmex_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + void *ptrAlpha, *ptrBeta; + + switch( tileC->flttype ) { + case ChamRealHalf: + { + CHAMELEON_Real16_t halpha = clargs->alpha; + CHAMELEON_Real16_t hbeta = clargs->beta; + ptrAlpha = &halpha; + ptrBeta = &hbeta; + } + break; + case ChamRealFloat: + { + float salpha = clargs->alpha; + float sbeta = clargs->beta; + ptrAlpha = &salpha; + ptrBeta = &sbeta; + } + break; + case ChamRealDouble: + { + double dalpha = clargs->alpha; + double dbeta = clargs->beta; + ptrAlpha = &dalpha; + ptrBeta = &dbeta; + } + break; + case ChamComplexFloat: + { + CHAMELEON_Complex32_t calpha = clargs->alpha; + CHAMELEON_Complex32_t cbeta = clargs->beta; + ptrAlpha = &calpha; + ptrBeta = &cbeta; + } + break; + case ChamComplexDouble: + { + CHAMELEON_Complex64_t zalpha = clargs->alpha; + CHAMELEON_Complex64_t zbeta = clargs->beta; + ptrAlpha = &zalpha; + ptrBeta = &zbeta; + } + break; + default: + fprintf( stderr, "cl_gemmex: Unknown C datatype\n" ); + return; + } + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + CUDA_gemmex( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + ptrAlpha, + tileA->mat, tileA->ld, tileA->flttype, + tileB->mat, tileB->ld, tileB->flttype, + ptrBeta, + tileC->mat, tileC->ld, tileC->flttype, + handle ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( gemmex, NULL, cl_gemmex_cuda_func, STARPU_CUDA_ASYNC ) + +void INSERT_TASK_gemmex_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_hlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_gemmex_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "gemmex_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_gemmex_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_gemmex_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_gemmex, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_gemmex_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_gemmex( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + double alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + double beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_gemmex_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "gemmex"; + + if ( !(options->withcuda) ) { + /* Fallback to cpu version */ + INSERT_TASK_gemm( options, transA, transB, m, n, k, nb, + alpha, A, Am, An, B, Bm, Bn, beta, C, Cm, Cn ); + return; + } + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_gemmex_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_gemmex_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_gemmex, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_gemmex_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} diff --git a/runtime/starpu/codelets/codelet_hgemm.c b/runtime/starpu/codelets/codelet_hgemm.c new file mode 100644 index 0000000000000000000000000000000000000000..a595bf990cd336e50aee8c541954ef5591c5ed8b --- /dev/null +++ b/runtime/starpu/codelets/codelet_hgemm.c @@ -0,0 +1,250 @@ +/** + * + * @file starpu/codelet_hgemm.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon hgemm StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelets.h" + +CHAMELEON_CL_CB( hgemm, cti_handle_get_m(task->handles[2]), cti_handle_get_n(task->handles[2]), cti_handle_get_n(task->handles[0]), 2. *M*N*K) /* If A^t, computation is wrong */ + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_hgemm_cuda_func( void *descr[], void *cl_arg ) +{ + struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + CUDA_hgemm( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (CHAMELEON_Real16_t*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (CHAMELEON_Real16_t*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_hgemm_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_hgemm_args_s *clargs = (struct cl_hgemm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + HIP_hgemm( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (CHAMELEON_Real16_t*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (CHAMELEON_Real16_t*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); + + return; +} +#endif /* defined(CHAMELEON_USE_HIP) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( hgemm, NULL, cl_hgemm_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( hgemm, NULL, cl_hgemm_cuda_func, STARPU_CUDA_ASYNC ) +#endif + +void INSERT_TASK_hgemm_Astat( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_hlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_hgemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "hgemm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_hgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_hgemm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_hgemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_hgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Real16_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Real16_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + /* if ( alpha == 0. ) { */ + /* INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, */ + /* beta, C, Cm, Cn ); */ + /* return; */ + /* } */ + + struct cl_hgemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "hgemm"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_hgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->beta = beta; + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_hgemm_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + + /* Refine name */ + cl_name = chameleon_codelet_name( cl_name, 3, + A->get_blktile( A, Am, An ), + B->get_blktile( B, Bm, Bn ), + C->get_blktile( C, Cm, Cn ) ); + + /* Insert the task */ + rt_starpu_insert_task( + &cl_hgemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_hgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, ChamRealHalf, Am, An), + STARPU_R, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + accessC, RTBLKADDR(C, ChamRealHalf, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c index 97de57e26efa988a6e425573d637d13115f8f2d3..a55c56ef8fc79e84db3a04a9fa332e889443192b 100644 --- a/runtime/starpu/codelets/codelet_map.c +++ b/runtime/starpu/codelets/codelet_map.c @@ -9,10 +9,10 @@ * * @brief Chameleon map StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * */ #include "chameleon_starpu.h" @@ -49,25 +49,29 @@ void INSERT_TASK_map( const RUNTIME_option_t *options, struct starpu_codelet *codelet = &cl_map; void (*callback)(void*) = options->profiling ? cl_map_callback : NULL; + char *cl_name = "map"; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); CHAMELEON_END_ACCESS_DECLARATION; + cl_name = chameleon_codelet_name( cl_name, 1, + A->get_blktile( A, Am, An ) ); + rt_starpu_insert_task( codelet, STARPU_VALUE, &A, sizeof(CHAM_desc_t*), STARPU_VALUE, &uplo, sizeof(cham_uplo_t), STARPU_VALUE, &Am, sizeof(int), STARPU_VALUE, &An, sizeof(int), - cham_to_starpu_access(accessA), RTBLKADDR(A, void, Am, An), + cham_to_starpu_access(accessA), RTBLKADDR(A, ChamByte, Am, An), STARPU_VALUE, &op_fct, sizeof(cham_unary_operator_t), STARPU_VALUE, &op_args, sizeof(void*), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "map", + STARPU_NAME, cl_name, #endif 0); } diff --git a/runtime/starpu/codelets/codelet_zaxpy.c b/runtime/starpu/codelets/codelet_zaxpy.c index 2c1170dacd7cc1a569627028ccb0989bfcd9e1ce..be1fbbbf422325c84068513392c11bb7352c424c 100644 --- a/runtime/starpu/codelets/codelet_zaxpy.c +++ b/runtime/starpu/codelets/codelet_zaxpy.c @@ -11,11 +11,11 @@ * * @brief Chameleon zaxpy StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -65,9 +65,9 @@ void INSERT_TASK_zaxpy( const RUNTIME_option_t *options, codelet, STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &incA, sizeof(int), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_VALUE, &incB, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_zbuild.c b/runtime/starpu/codelets/codelet_zbuild.c index ad92712dab565e6969e21f09669b3271a1c24d19..3ce93dea06a18299c2ac4fa0d7c471b8b975d297 100644 --- a/runtime/starpu/codelets/codelet_zbuild.c +++ b/runtime/starpu/codelets/codelet_zbuild.c @@ -11,7 +11,7 @@ * * @brief Chameleon zbuild StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Piotr Luszczek @@ -23,7 +23,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -79,7 +79,7 @@ CODELETS_CPU(zbuild, cl_zbuild_cpu_func) STARPU_VALUE, &row_max, sizeof(int), STARPU_VALUE, &col_min, sizeof(int), STARPU_VALUE, &col_max, sizeof(int), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &user_data, sizeof(void*), STARPU_VALUE, &user_build_callback, sizeof(void*), STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 8f05509f70faf0d971c15ecb42aa064738611631..7d1bf87f6c669c18c06d72727d45713be1b86189 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -11,12 +11,12 @@ * * @brief Chameleon zcallback StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -84,3 +84,7 @@ CHAMELEON_CL_CB(ztsmqr_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(ztstrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), M* M*M) CHAMELEON_CL_CB(zunmlq, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) CHAMELEON_CL_CB(zunmqr, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) +#if defined(PRECISION_d) || defined(PRECISION_s) +CHAMELEON_CL_CB(dlag2h, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(hlag2d, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +#endif diff --git a/runtime/starpu/codelets/codelet_zcesca.c b/runtime/starpu/codelets/codelet_zcesca.c index e2bdf69b2b841c28dcd58bcf1b848f9a167dff41..74cb9c7c6834723ae1a536b933b450c8b6439344 100644 --- a/runtime/starpu/codelets/codelet_zcesca.c +++ b/runtime/starpu/codelets/codelet_zcesca.c @@ -9,9 +9,9 @@ * * @brief Chameleon zcesca StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -96,12 +96,12 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zcesca_args_s), - STARPU_R, RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin), - STARPU_R, RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn), - STARPU_R, RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn), - STARPU_R, RTBLKADDR(Di, double, Dim, Din), - STARPU_R, RTBLKADDR(Dj, double, Djm, Djn), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(Gi, ChamComplexDouble, Gim, Gin), + STARPU_R, RTBLKADDR(Gj, ChamComplexDouble, Gjm, Gjn), + STARPU_R, RTBLKADDR(G, ChamComplexDouble, Gm, Gn), + STARPU_R, RTBLKADDR(Di, ChamRealDouble, Dim, Din), + STARPU_R, RTBLKADDR(Dj, ChamRealDouble, Djm, Djn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index d39ace8e1a1b0f1784184fdba9de4d7c45b46d0f..60daa3b177a7ec86bcbba840c0e64dd33e72b39b 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -11,14 +11,14 @@ * * @brief Chameleon zgeadd StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -104,9 +104,9 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessB, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index 2bacf5dfc1683a0731bbb097e552660d5cd8d380..846482361badc28556fb6da43492b38754354ece 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgelqt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -80,8 +80,8 @@ void INSERT_TASK_zgelqt(const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), /* max( nb * (ib+1), ib * (ib+nb) ) */ STARPU_SCRATCH, options->ws_worker, /* /\* ib*n + 3*ib*ib + max(m,n) *\/ */ diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index cf9b4b168f1abdfe222b1cd3be504b7dfb6bf38b..4b47627258ba618751e82a33945f3fbeed27c8bc 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -31,16 +31,6 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -struct cl_zgemm_args_s { - cham_trans_t transA; - cham_trans_t transB; - int m; - int n; - int k; - CHAMELEON_Complex64_t alpha; - CHAMELEON_Complex64_t beta; -}; - #if !defined(CHAMELEON_SIMULATION) static void cl_zgemm_cpu_func( void *descr[], void *cl_arg ) @@ -201,9 +191,9 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, @@ -272,9 +262,9 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c index 4d0f2dd0920df83898c9f6f0222a49ddc567af49..a2529d99995bf272d4e960916c61c82fea26b762 100644 --- a/runtime/starpu/codelets/codelet_zgemv.c +++ b/runtime/starpu/codelets/codelet_zgemv.c @@ -11,9 +11,9 @@ * * @brief Chameleon zgemv StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -109,11 +109,11 @@ void INSERT_TASK_zgemv( const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(X, CHAMELEON_Complex64_t, Xm, Xn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(X, ChamComplexDouble, Xm, Xn), STARPU_VALUE, &incX, sizeof(int), STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessY, RTBLKADDR(Y, CHAMELEON_Complex64_t, Ym, Yn), + accessY, RTBLKADDR(Y, ChamComplexDouble, Ym, Yn), STARPU_VALUE, &incY, sizeof(int), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index f77f09c5da068b333fc564c94c6287685e936cd9..0ee282e4a02c5bb1dc47e82c474c88a445144114 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgeqrt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -81,8 +81,8 @@ void INSERT_TASK_zgeqrt(const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), /* max( nb * (ib+1), ib * (ib+nb) ) */ STARPU_SCRATCH, options->ws_worker, /* ib * (m+3*ib) + max(m,n) */ diff --git a/runtime/starpu/codelets/codelet_zgered.c b/runtime/starpu/codelets/codelet_zgered.c new file mode 100644 index 0000000000000000000000000000000000000000..394ef74c24ac9023a1b594b348ba21479df48d3a --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgered.c @@ -0,0 +1,142 @@ +/** + * + * @file starpu/codelet_zgered.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgered StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_starpu.h" +#include <coreblas/lapacke.h> +#include "runtime_codelet_zc.h" +#include "runtime_codelet_z.h" + +//#define CHAMELEON_DEBUG_GERED + +void INSERT_TASK_zgered( const RUNTIME_option_t *options, + double threshold, double Anorm, int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + CHAM_tile_t *tileA; + double u_low; + int64_t mm, nn; +#if defined(CHAMELEON_USE_MPI) + int tag; +#endif + starpu_data_handle_t *handleAin; + starpu_data_handle_t handleAout; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + /* Get the Input handle */ + mm = Am + (A->i / A->mb); + nn = An + (A->j / A->nb); + handleAin = A->schedopt; + handleAin += ((int64_t)A->lmt) * nn + mm; + + assert( *handleAin != NULL ); + + /* + * Lets convert the tile precision based on the following criteria: + * + * ||A_{i,j}||_F < u_{high} * || A ||_F / (nt * u_{low}) + * ||A_{i,j}||_F < u_{high} * || A ||_F / nt * 1/ u_{low} + * ||A_{i,j}||_F < threshold / u_{low} + */ + + tileA = A->get_blktile( A, Am, An ); +#if defined(CHAMELEON_USE_MPI) + tag = starpu_mpi_data_get_tag( *handleAin ); +#endif /* defined(CHAMELEON_USE_MPI) */ + +#if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500) +#if defined(PRECISION_d) + if ( options->withcuda ) { + /* + * Check for half precision + */ + u_low = 1.e-4; + if ( Anorm < (threshold / u_low) ) { +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] Convert the tile ( %d, %d ) to half precision\n", + A->myrank, Am, An ); +#endif + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexHalf ); + + rt_starpu_insert_task( + &cl_dlag2h, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2h", +#endif + 0); + + starpu_data_unregister_submit( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexHalf; +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( handleAout, tag, tileA->rank ); +#endif + return; + } + } +#endif +#endif + + /* + * Check for single precision + */ +#if !defined(CHAMELEON_SIMULATION) + u_low = LAPACKE_slamch_work('e'); +#else + u_low = 1e-8; +#endif + if ( Anorm < (threshold / u_low) ) { +#if defined(CHAMELEON_DEBUG_GERED) + fprintf( stderr, + "[%2d] Convert the tile ( %d, %d ) to single precision\n", + A->myrank, Am, An ); +#endif + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexFloat ); + + rt_starpu_insert_task( + &cl_zlag2c, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "zlag2c", +#endif + 0); + + starpu_data_unregister_submit( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexFloat; +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( *handleAin, tag, tileA->rank ); +#endif + return; + } +} diff --git a/runtime/starpu/codelets/codelet_zgerst.c b/runtime/starpu/codelets/codelet_zgerst.c new file mode 100644 index 0000000000000000000000000000000000000000..68490f011eef8227bfa656bafdf60ab4c0e70e79 --- /dev/null +++ b/runtime/starpu/codelets/codelet_zgerst.c @@ -0,0 +1,118 @@ +/** + * + * @file starpu/codelet_zgerst.c + * + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon zgerst StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions normal z -> d + * + */ +#include "chameleon_starpu.h" +#include <coreblas/lapacke.h> +#include "runtime_codelet_zc.h" +#include "runtime_codelet_z.h" + +//#define CHAMELEON_DEBUG_GERST + +void INSERT_TASK_zgerst( const RUNTIME_option_t *options, + int m, int n, + const CHAM_desc_t *A, int Am, int An ) +{ + CHAM_tile_t *tileA; + int64_t mm, nn; +#if defined(CHAMELEON_USE_MPI) + int tag; +#endif + starpu_data_handle_t *handleAin; + starpu_data_handle_t handleAout; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_RW(A, Am, An); + CHAMELEON_END_ACCESS_DECLARATION; + + tileA = A->get_blktile( A, Am, An ); + if ( tileA->flttype == ChamComplexDouble ) { + return; + } + + /* Get the Input handle */ + mm = Am + (A->i / A->mb); + nn = An + (A->j / A->nb); + handleAin = A->schedopt; + handleAin += ((int64_t)A->lmt) * nn + mm; + + assert( *handleAin != NULL ); + +#if defined(CHAMELEON_USE_MPI) + tag = starpu_mpi_data_get_tag( *handleAin ); +#endif /* defined(CHAMELEON_USE_MPI) */ + + starpu_cham_tile_register( &handleAout, -1, tileA, ChamComplexDouble ); + + switch( tileA->flttype ) { +#if defined(CHAMELEON_USE_CUDA) && (CUDA_VERSION >= 7500) +#if defined(PRECISION_d) + /* + * Restore from half precision + */ + case ChamComplexHalf: +#if defined(CHAMELEON_DEBUG_GERST) + fprintf( stderr, + "[%2d] Convert back the tile ( %d, %d ) from half precision\n", + A->myrank, Am, An ); +#endif + rt_starpu_insert_task( + &cl_hlag2d, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "hlag2d", +#endif + 0); + break; +#endif +#endif + + case ChamComplexFloat: +#if defined(CHAMELEON_DEBUG_GERST) + fprintf( stderr, + "[%2d] Convert back the tile ( %d, %d ) from half precision\n", + A->myrank, Am, An ); +#endif + rt_starpu_insert_task( + &cl_clag2z, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, *handleAin, + STARPU_W, handleAout, + STARPU_PRIORITY, options->priority, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "clag2z", +#endif + 0); + break; + + default: + fprintf( stderr, "ERROR: Unknonw input datatype" ); + } + + starpu_data_unregister_submit( *handleAin ); + *handleAin = handleAout; + tileA->flttype = ChamComplexDouble; +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( handleAout, tag, tileA->rank ); +#endif +} diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c index 5a53839a635243b0fb8904cd7768d852a984b70a..7c218ab0075ef5c56ceda76360c9797fe765546d 100644 --- a/runtime/starpu/codelets/codelet_zgersum.c +++ b/runtime/starpu/codelets/codelet_zgersum.c @@ -11,11 +11,11 @@ * * @brief Chameleon zgersum StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Romain Peressoni * @author Mathieu Faverge * @author Antoine Jego - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -117,7 +117,7 @@ void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) { #if defined(HAVE_STARPU_MPI_REDUX) - starpu_data_set_reduction_methods( RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + starpu_data_set_reduction_methods( RTBLKADDR(A, ChamComplexDouble, Am, An), &cl_zgersum_redux, &cl_zgersum_init ); #endif @@ -129,7 +129,7 @@ RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, { #if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI) starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD, - RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + RTBLKADDR(A, ChamComplexDouble, Am, An), options->priority + 1, 2 /* Binary tree */ ); #else diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index edd99dc349250967a54a951f0e83eddda11bcb9c..2afae321494854f744c6ad1fcf9161db8701b402 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgessm StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -77,9 +77,9 @@ void INSERT_TASK_zgessm( const RUNTIME_option_t *options, STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &IPIV, sizeof(int*), - STARPU_R, RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), - STARPU_R, RTBLKADDR(D, CHAMELEON_Complex64_t, Dm, Dn), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(L, ChamComplexDouble, Lm, Ln), + STARPU_R, RTBLKADDR(D, ChamComplexDouble, Dm, Dn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgessq.c b/runtime/starpu/codelets/codelet_zgessq.c index 8972fea258861e817c83486682a8d844c3925bc6..e7de6b666101fbb6d240e85ead4bcb849f9dbe24 100644 --- a/runtime/starpu/codelets/codelet_zgessq.c +++ b/runtime/starpu/codelets/codelet_zgessq.c @@ -11,14 +11,14 @@ * * @brief Chameleon zgessq StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -65,8 +65,8 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options, STARPU_VALUE, &storev, sizeof(cham_store_t), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(SCALESUMSQ, ChamRealDouble, SCALESUMSQm, SCALESUMSQn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgesum.c b/runtime/starpu/codelets/codelet_zgesum.c index d8b64642637f1a8c4e13026495a132344ee3bd4a..d1947b4a1ecb6666008e63c9b82f558c1bb60e8e 100644 --- a/runtime/starpu/codelets/codelet_zgesum.c +++ b/runtime/starpu/codelets/codelet_zgesum.c @@ -9,9 +9,9 @@ * * @brief Chameleon zgesum StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -69,8 +69,8 @@ void INSERT_TASK_zgesum( const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, STARPU_CL_ARGS, clargs, sizeof(struct cl_zgesum_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(SUMS, ChamComplexDouble, SUMSm, SUMSn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zgetrf.c b/runtime/starpu/codelets/codelet_zgetrf.c index 6f667ac70c1869cbe24196553bf7fa8247ff7d6e..568265ee27721d0c34d80012e860514772a414b4 100644 --- a/runtime/starpu/codelets/codelet_zgetrf.c +++ b/runtime/starpu/codelets/codelet_zgetrf.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgetrf StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -74,7 +74,7 @@ void INSERT_TASK_zgetrf( const RUNTIME_option_t *options, codelet, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &IPIV, sizeof(int*), STARPU_VALUE, &check_info, sizeof(cham_bool_t), STARPU_VALUE, &iinfo, sizeof(int), diff --git a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c index 5ea78b61145999a5ce02803361e5448c9aa9f562..03398f6447c34fb15f85dba005d2a92cb9d1aa9a 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_incpiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_incpiv.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgetrf_incpiv StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -83,8 +83,8 @@ void INSERT_TASK_zgetrf_incpiv(const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(L, ChamComplexDouble, Lm, Ln), STARPU_VALUE, &IPIV, sizeof(int*), STARPU_VALUE, &check_info, sizeof(cham_bool_t), STARPU_VALUE, &iinfo, sizeof(int), diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c index 9c40effb01e380ebba58794a1ba06dc1673965a3..4488882e796ff7d9cb87a07f57e9b40c3f49036c 100644 --- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c @@ -11,7 +11,7 @@ * * @brief Chameleon zgetrf_nopiv StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Omar Zenati * @author Mathieu Faverge * @author Emmanuel Agullo @@ -19,7 +19,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -75,7 +75,7 @@ void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &iinfo, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), diff --git a/runtime/starpu/codelets/codelet_zgram.c b/runtime/starpu/codelets/codelet_zgram.c index 83437b670113205d7d88dac910895997695440e2..883093966a5355c00a2fc9f0de9d35bd4b0c823c 100644 --- a/runtime/starpu/codelets/codelet_zgram.c +++ b/runtime/starpu/codelets/codelet_zgram.c @@ -9,11 +9,11 @@ * * @brief Chameleon zgram StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost * @author Lucas Barros de Assis - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -70,10 +70,10 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &mt, sizeof(int), STARPU_VALUE, &nt, sizeof(int), - STARPU_R, RTBLKADDR(Di, double, Dim, Din), - STARPU_R, RTBLKADDR(Dj, double, Djm, Djn), - STARPU_R, RTBLKADDR(D, double, Dm, Dn), - STARPU_RW, RTBLKADDR(A, double, Am, An), + STARPU_R, RTBLKADDR(Di, ChamRealDouble, Dim, Din), + STARPU_R, RTBLKADDR(Dj, ChamRealDouble, Djm, Djn), + STARPU_R, RTBLKADDR(D, ChamRealDouble, Dm, Dn), + STARPU_RW, RTBLKADDR(A, ChamRealDouble, Am, An), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c index 638b3a9f61d71a2d002c0f4bbb2ddf23a449d0d9..2fa77bfb0464567fdbdf1f4decac0627c39c8076 100644 --- a/runtime/starpu/codelets/codelet_zhe2ge.c +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -11,12 +11,12 @@ * * @brief Chameleon zhe2ge StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -65,8 +65,8 @@ void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options, STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 67fe91b28888f1532966332f275683e9c6ba5717..c38e525963aed0aef6789129b8ae2cb8aa8a96d7 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -193,9 +193,9 @@ void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, @@ -263,9 +263,9 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 96ff9fbf871d309c7674da923826d2989729c7da..223522bbb6559cb1b753579bf48ac608089bdd92 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -11,7 +11,7 @@ * * @brief Chameleon zher2k StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Loris Lucido - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c * */ @@ -154,10 +154,10 @@ INSERT_TASK_zher2k( const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_VALUE, &beta, sizeof(double), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c index 9665901ee5d3cd196db65ed8c1030a1be357b3e1..1429529684ee9630f21717f1bc1d594c06697469 100644 --- a/runtime/starpu/codelets/codelet_zherfb.c +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -11,12 +11,12 @@ * * @brief Chameleon zherfb StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Lucas Barros de Assis * @author Florent Pruvost * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -106,9 +106,9 @@ void INSERT_TASK_zherfb(const RUNTIME_option_t *options, STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &nb, sizeof(int), STARPU_VALUE, &nb, sizeof(int), /* ldw */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_RW, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), + STARPU_RW, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index c53607f9d1034671d29bb2a4160d26e535daf598..c9a9e3017c11c26ce042ea8ac9c3990d2db1b0c1 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -157,8 +157,8 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, &cl_zherk, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zherk_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 5a0deb8da5757a4c9120703de5b82459adc0da00..1614e45af17f684366b36d359f0c30fcafe623d3 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -112,8 +112,8 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, &cl_zlacpyx, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Common task arguments */ STARPU_PRIORITY, options->priority, @@ -162,8 +162,8 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, &cl_zlacpy, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index b6ad117f48741309e6b1ab2e54ac2a827a155982..961cc9c0376c0fa48ce6bbf0fe2b4e183e3714ff 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -11,16 +11,14 @@ * * @brief Chameleon zlag2c StarPU codelet * - * @version 1.2.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 + * @version 1.3.0 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions mixed zc -> ds * */ @@ -28,7 +26,8 @@ #include "runtime_codelet_zc.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) +static void +cl_zlag2c_cpu_func( void *descr[], void *cl_arg ) { int info = 0; int m; @@ -39,15 +38,44 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); + starpu_codelet_unpack_args( cl_arg, &m, &n ); TCORE_zlag2c( m, n, tileA, tileB, &info ); } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zlag2c_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_zlag2c( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_zlag2c failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlag2c, cl_zlag2c_cpu_func) +CODELETS( zlag2c, cl_zlag2c_cpu_func, cl_zlag2c_cuda_func, STARPU_CUDA_ASYNC ) /** * @@ -72,8 +100,8 @@ void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, codelet, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex32_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexFloat, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -97,17 +125,45 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_clag2z( m, n, tileA, tileB); } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_clag2z_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_clag2z( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + if ( rc != 0 ) { + fprintf( stderr, "core_clag2z failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(clag2z, cl_clag2z_cpu_func) +CODELETS( clag2z, cl_clag2z_cpu_func, cl_clag2z_cuda_func, STARPU_CUDA_ASYNC ) -void INSERT_TASK_clag2z(const RUNTIME_option_t *options, - int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_clag2z( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { (void)nb; struct starpu_codelet *codelet = &cl_clag2z; @@ -122,8 +178,8 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options, codelet, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexFloat, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlange.c b/runtime/starpu/codelets/codelet_zlange.c index 38d5d7304e9768612917fa9344e9e4e6240182eb..1747215a60e72b1cdac0c7e77bfff04a31725ce7 100644 --- a/runtime/starpu/codelets/codelet_zlange.c +++ b/runtime/starpu/codelets/codelet_zlange.c @@ -11,7 +11,7 @@ * * @brief Chameleon zlange StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -70,9 +70,9 @@ void INSERT_TASK_zlange( const RUNTIME_option_t *options, STARPU_VALUE, &norm, sizeof(cham_normtype_t), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_SCRATCH, options->ws_worker, - STARPU_W, RTBLKADDR(B, double, Bm, Bn), + STARPU_W, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -121,8 +121,8 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options, rt_starpu_insert_task( codelet, - STARPU_R, RTBLKADDR(A, double, Am, An), - STARPU_RW, RTBLKADDR(B, double, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamRealDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlanhe.c b/runtime/starpu/codelets/codelet_zlanhe.c index 0245319f3f8ab78997c60b5b9332eb2be265f223..c45ecc83987918785f212fd8922453e679ecb63f 100644 --- a/runtime/starpu/codelets/codelet_zlanhe.c +++ b/runtime/starpu/codelets/codelet_zlanhe.c @@ -11,7 +11,7 @@ * * @brief Chameleon zlanhe StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c * */ @@ -69,9 +69,9 @@ void INSERT_TASK_zlanhe(const RUNTIME_option_t *options, STARPU_VALUE, &norm, sizeof(int), STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_SCRATCH, options->ws_worker, - STARPU_W, RTBLKADDR(B, double, Bm, Bn), + STARPU_W, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlansy.c b/runtime/starpu/codelets/codelet_zlansy.c index f75ad866e0066addc6c0d32c6f3882ada8f09809..21b5a1140c3a9d94d032308cf81a55b751844dbd 100644 --- a/runtime/starpu/codelets/codelet_zlansy.c +++ b/runtime/starpu/codelets/codelet_zlansy.c @@ -11,7 +11,7 @@ * * @brief Chameleon zlansy StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Julien Langou @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -70,9 +70,9 @@ void INSERT_TASK_zlansy( const RUNTIME_option_t *options, STARPU_VALUE, &norm, sizeof(int), STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_SCRATCH, options->ws_worker, - STARPU_W, RTBLKADDR(B, double, Bm, Bn), + STARPU_W, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlantr.c b/runtime/starpu/codelets/codelet_zlantr.c index ff96adb2c9a80100cee65978a44eea36865eb35e..e2067a02c392391c39dc28e9bf81ebf922defc4e 100644 --- a/runtime/starpu/codelets/codelet_zlantr.c +++ b/runtime/starpu/codelets/codelet_zlantr.c @@ -11,14 +11,14 @@ * * @brief Chameleon zlantr StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Florent Pruvost * @author Lucas Barros de Assis * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -71,9 +71,9 @@ void INSERT_TASK_zlantr( const RUNTIME_option_t *options, STARPU_VALUE, &diag, sizeof(int), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_SCRATCH, options->ws_worker, - STARPU_W, RTBLKADDR(B, double, Bm, Bn), + STARPU_W, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index 19162d6388142d66808bd436e959aa048dc2b831..1246af7734068fdb4b836e03ba87adee2751ba69 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -91,7 +91,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, &cl_zlascal, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlascal_args_s), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c index c6acdc531ba2b8b9245aca7716885672caf0a5be..b78055d3385b4609cb43c1a6056e99cd58815e07 100644 --- a/runtime/starpu/codelets/codelet_zlaset.c +++ b/runtime/starpu/codelets/codelet_zlaset.c @@ -85,7 +85,7 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, &cl_zlaset, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaset_args_s), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zlaset2.c b/runtime/starpu/codelets/codelet_zlaset2.c index 9c1bc9f3d9b5c397ef11478d9e2f803faa97b66e..33a136f359bb8979d09f6222bddc866e2ef2d437 100644 --- a/runtime/starpu/codelets/codelet_zlaset2.c +++ b/runtime/starpu/codelets/codelet_zlaset2.c @@ -11,7 +11,7 @@ * * @brief Chameleon zlaset2 StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -21,7 +21,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -66,7 +66,7 @@ void INSERT_TASK_zlaset2(const RUNTIME_option_t *options, STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlatro.c b/runtime/starpu/codelets/codelet_zlatro.c index 9befacd6780ffde876cd38da6efe15d18bf7ae9a..42fa0637df433f277656c09c259b55d33df347e3 100644 --- a/runtime/starpu/codelets/codelet_zlatro.c +++ b/runtime/starpu/codelets/codelet_zlatro.c @@ -11,7 +11,7 @@ * * @brief Chameleon zlatro StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Julien Langou @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -77,8 +77,8 @@ void INSERT_TASK_zlatro( const RUNTIME_option_t *options, STARPU_VALUE, &trans, sizeof(int), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index bc49ece66b29f26d901891c3ca6900bbee749a87..bdb000e520bebbc92b8aaa45adbf883bb6d5ed52 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -83,7 +83,7 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, &cl_zlauum, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlauum_args_s), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zpanel.c b/runtime/starpu/codelets/codelet_zpanel.c index 0b917134ef9ce05f6f91bca822f9d8b5b3d18dd1..7e450986c7448fbc1bd6c2398a7e21df9c63bd89 100644 --- a/runtime/starpu/codelets/codelet_zpanel.c +++ b/runtime/starpu/codelets/codelet_zpanel.c @@ -9,12 +9,12 @@ * * @brief Chameleon zpanel StarPU codelets * - * @version 1.2.0 + * @version 1.3.0 * @comment Codelets to perform panel factorization with partial pivoting * * @author Mathieu Faverge * @author Matthieu Kuhn - * @date 2023-02-21 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -93,8 +93,8 @@ void INSERT_TASK_zgetrf_panel_nopiv_percol_diag( const RUNTIME_option_t *options STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(U, ChamComplexDouble, Um, Un), STARPU_VALUE, &iinfo, sizeof(int), STARPU_VALUE, &(options->sequence), sizeof(RUNTIME_sequence_t*), STARPU_VALUE, &(options->request), sizeof(RUNTIME_request_t*), @@ -165,8 +165,8 @@ void INSERT_TASK_zgetrf_panel_nopiv_percol_trsm( const RUNTIME_option_t *options STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(U, ChamComplexDouble, Um, Un), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c index 34bee5797a5c7355a33d073c7b275ed6dcc5eadf..bc3052e2730e8d394bb2bc4ac7a93f44a0642e1a 100644 --- a/runtime/starpu/codelets/codelet_zplghe.c +++ b/runtime/starpu/codelets/codelet_zplghe.c @@ -89,7 +89,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, &cl_zplghe, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zplghe_args_s), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c index e39a3c69bc23ba9e16cbc9fce04734ea0332917c..fd29f8d0a0263bf97c7e76aa97b746e5febf6099 100644 --- a/runtime/starpu/codelets/codelet_zplgsy.c +++ b/runtime/starpu/codelets/codelet_zplgsy.c @@ -89,7 +89,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, &cl_zplgsy, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zplgsy_args_s), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index 33a2bee23ca7b7ee93ebe9bbbc3d6f7cff2ff00b..abb1ebd399c75d3b8db3cc77dd3239e1c81f5047 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -88,7 +88,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, &cl_zplrnt, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zplrnt_args_s), - STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c index 5d76a91f733a55fb919297ad3c512ac39d57bbfa..6a1c7a54d0410bb1a1e5aa64c03c25235c60ebdd 100644 --- a/runtime/starpu/codelets/codelet_zplssq.c +++ b/runtime/starpu/codelets/codelet_zplssq.c @@ -11,13 +11,13 @@ * * @brief Chameleon zplssq StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -69,8 +69,8 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options, STARPU_VALUE, &storev, sizeof(int), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR( IN, double, INm, INn ), - STARPU_RW, RTBLKADDR( OUT, double, OUTm, OUTn ), + STARPU_R, RTBLKADDR( IN, ChamRealDouble, INm, INn ), + STARPU_RW, RTBLKADDR( OUT, ChamRealDouble, OUTm, OUTn ), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -115,7 +115,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N, rt_starpu_insert_task( codelet, STARPU_VALUE, &N, sizeof(int), - STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn), + STARPU_RW, RTBLKADDR(RESULT, ChamRealDouble, RESULTm, RESULTn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index b8b18765db71cecf6a9f79a4b3a333cb272f165d..1195a1e93e5d13db02e75c3bcd962a95394cd688 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -100,7 +100,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, &cl_zpotrf, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zpotrf_args_s), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zssssm.c b/runtime/starpu/codelets/codelet_zssssm.c index b0f413cf8b9956aca377adf39c11d1faa871f1bf..f9cece0bf2156ec58922f77fdd10d077c7150875 100644 --- a/runtime/starpu/codelets/codelet_zssssm.c +++ b/runtime/starpu/codelets/codelet_zssssm.c @@ -11,7 +11,7 @@ * * @brief Chameleon zssssm StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -85,10 +85,10 @@ void INSERT_TASK_zssssm( const RUNTIME_option_t *options, STARPU_VALUE, &n2, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_R, RTBLKADDR(L1, CHAMELEON_Complex64_t, L1m, L1n), - STARPU_R, RTBLKADDR(L2, CHAMELEON_Complex64_t, L2m, L2n), + STARPU_RW, RTBLKADDR(A1, ChamComplexDouble, A1m, A1n), + STARPU_RW, RTBLKADDR(A2, ChamComplexDouble, A2m, A2n), + STARPU_R, RTBLKADDR(L1, ChamComplexDouble, L1m, L1n), + STARPU_R, RTBLKADDR(L2, ChamComplexDouble, L2m, L2n), STARPU_VALUE, &IPIV, sizeof(int*), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index ed8dea80b401d87dc8bd953c169eb11d2945cb7d..6748325a666032c7d7cb210da4f49cb1b3d247ed 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -194,9 +194,9 @@ void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, @@ -264,9 +264,9 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s), /* Task handles */ - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index e6dd471e68010f04a3296b3dd5299b19e76c8537..51f72d20ea50b88bc00120ce412338ff0d00bad0 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -11,7 +11,7 @@ * * @brief Chameleon zsyr2k StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -20,7 +20,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Loris Lucido - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -153,10 +153,10 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index f11dd4158341615defeb35755d8c8dcabb28ebce..9df9fd08ad4e79cd8017f95a0596eca2e3c872f0 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -157,8 +157,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, &cl_zsyrk, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyrk_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + accessC, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zsyssq.c b/runtime/starpu/codelets/codelet_zsyssq.c index e50ceb11e09336f79600b24f7c6255746ffe9036..cf710e468f8e3f6135357b38549a177903981d97 100644 --- a/runtime/starpu/codelets/codelet_zsyssq.c +++ b/runtime/starpu/codelets/codelet_zsyssq.c @@ -11,13 +11,13 @@ * * @brief Chameleon zsyssq StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -63,8 +63,8 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options, STARPU_VALUE, &storev, sizeof(cham_store_t), STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(SCALESUMSQ, ChamRealDouble, SCALESUMSQm, SCALESUMSQn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_zsytrf_nopiv.c b/runtime/starpu/codelets/codelet_zsytrf_nopiv.c index 0cf3b092f721fb6cfbf53b50c45a1114bacb41de..5882c5855b303b90c56636aa0a3f9a64afcc11e5 100644 --- a/runtime/starpu/codelets/codelet_zsytrf_nopiv.c +++ b/runtime/starpu/codelets/codelet_zsytrf_nopiv.c @@ -11,7 +11,7 @@ * * @brief Chameleon zsytrf_nopiv StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Jakub Kurzak * @author Mathieu Faverge @@ -21,7 +21,7 @@ * @author Marc Sergent * @author Lucas Barros de Assis * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c * */ @@ -64,7 +64,7 @@ void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options, codelet, STARPU_VALUE, &uplo, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), STARPU_VALUE, &iinfo, sizeof(int), /* STARPU_SCRATCH, options->ws_worker, */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztplqt.c b/runtime/starpu/codelets/codelet_ztplqt.c index dc68af0f1f598db8bedd8af0ddb6a6d84c8b0aa5..3471d4878436fd2d49f4ca1835195eb73f03de37 100644 --- a/runtime/starpu/codelets/codelet_ztplqt.c +++ b/runtime/starpu/codelets/codelet_ztplqt.c @@ -11,11 +11,11 @@ * * @brief Chameleon ztplqt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> s d c * */ @@ -72,9 +72,9 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options, STARPU_VALUE, &N, sizeof(int), STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_W, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 75af260d6f33289a32396d00d03a36abb1e09b79..fb2db26e4d978a4018dcf0d65fe3f47689c10004 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -9,11 +9,11 @@ * * @brief Chameleon ztpmlqt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> s d c * */ @@ -118,10 +118,10 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options, STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(V, ChamComplexDouble, Vm, Vn), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 7d266758adb40234a7f0b52eb8f257514473020c..bab1d8d0a7dfc41c74edd98b1143cc62ebd8330b 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -9,11 +9,11 @@ * * @brief Chameleon ztpmqrt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> s d c * */ @@ -118,10 +118,10 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options, STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &(options->ws_wsize), sizeof(size_t), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(V, ChamComplexDouble, Vm, Vn), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c index 6f9983b033dbbd027143c93847cd49419f101a3e..0ade7095a124201ccddb2b39ef804d355ecb0ee7 100644 --- a/runtime/starpu/codelets/codelet_ztpqrt.c +++ b/runtime/starpu/codelets/codelet_ztpqrt.c @@ -11,12 +11,12 @@ * * @brief Chameleon ztpqrt StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Florent Pruvost * @author Lucas Barros de Assis * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> s d c * */ @@ -73,9 +73,9 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options, STARPU_VALUE, &N, sizeof(int), STARPU_VALUE, &L, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_W, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_W, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), /* Other options */ STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index 66fc4da3ee1e6ee84595881561b6157c6c5e2b07..57b146d6a70ad2a93a455762cae0e5c01d16856a 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -98,8 +98,8 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, &cl_ztradd, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_ztradd_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + accessB, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztrasm.c b/runtime/starpu/codelets/codelet_ztrasm.c index f0e8cfb8318879c91a8ebf33fefb2d45b63c9534..c5412d41679a63c779aee6aa63a6a02c5ac2caf7 100644 --- a/runtime/starpu/codelets/codelet_ztrasm.c +++ b/runtime/starpu/codelets/codelet_ztrasm.c @@ -11,14 +11,14 @@ * * @brief Chameleon ztrasm StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -68,8 +68,8 @@ void INSERT_TASK_ztrasm( const RUNTIME_option_t *options, STARPU_VALUE, &diag, sizeof(int), STARPU_VALUE, &M, sizeof(int), STARPU_VALUE, &N, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, double, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamRealDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 36b31640cc56bf85d19a31900af7aab3d4ca43c0..a5f2a18773fbb964db65b9e28573c7bd85c945e8 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -148,8 +148,8 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, &cl_ztrmm, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrmm_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 901eeb7809ec81505559b2c3e83b4f7a4c7e1bb3..00d38429e460dd32999e85d3e9b882e5d57acdd8 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -152,8 +152,8 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, &cl_ztrsm, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrsm_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztrssq.c b/runtime/starpu/codelets/codelet_ztrssq.c index 0ca2119967510ee77a944ada7bca023cc68d0ebc..9d800319438bd57f759305c4f2553ccbbbbffcef 100644 --- a/runtime/starpu/codelets/codelet_ztrssq.c +++ b/runtime/starpu/codelets/codelet_ztrssq.c @@ -11,13 +11,13 @@ * * @brief Chameleon ztrssq StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.6.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -66,8 +66,8 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options, STARPU_VALUE, &diag, sizeof(int), STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_RW, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_RW, RTBLKADDR(SCALESUMSQ, ChamRealDouble, SCALESUMSQm, SCALESUMSQn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 19499f058f32c493d6be3216439f18e84de15d0e..4918849af0fe53409ef0530160767dc3794bdc03 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -96,7 +96,7 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, &cl_ztrtri, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrtri_args_s), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c index 55cb574df3cd1d6c703811c5690601874702e9e3..05510b05943b0858de0910e80cbe6dd8df96102e 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c @@ -11,14 +11,14 @@ * * @brief Chameleon ztsmlq_hetra1 StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Mathieu Faverge * @author Azzam Haidar * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -96,10 +96,10 @@ void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options, STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &ldWORK, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A1, ChamComplexDouble, A1m, A1n), + STARPU_RW, RTBLKADDR(A2, ChamComplexDouble, A2m, A2n), + STARPU_R, RTBLKADDR(V, ChamComplexDouble, Vm, Vn), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c index acdb1db4dfcbcf8465f1ce76aedda65d72da1860..be370a8878c703fe08491a19d8c3cff62b79f315 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c +++ b/runtime/starpu/codelets/codelet_ztsmqr_hetra1.c @@ -11,14 +11,14 @@ * * @brief Chameleon ztsmqr_hetra1 StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Hatem Ltaief * @author Mathieu Faverge * @author Azzam Haidar * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -96,10 +96,10 @@ void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options, STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &ldWORK, sizeof(int), - STARPU_RW, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), - STARPU_RW, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), - STARPU_R, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), + STARPU_RW, RTBLKADDR(A1, ChamComplexDouble, A1m, A1n), + STARPU_RW, RTBLKADDR(A2, ChamComplexDouble, A2m, A2n), + STARPU_R, RTBLKADDR(V, ChamComplexDouble, Vm, Vn), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), STARPU_SCRATCH, options->ws_worker, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, diff --git a/runtime/starpu/codelets/codelet_ztstrf.c b/runtime/starpu/codelets/codelet_ztstrf.c index 1ad32614ed03fd075dcb3231dab2228d5c22d6e4..4839c0d5abd627dea9c64df0a6d0ec972edd0ac1 100644 --- a/runtime/starpu/codelets/codelet_ztstrf.c +++ b/runtime/starpu/codelets/codelet_ztstrf.c @@ -11,7 +11,7 @@ * * @brief Chameleon ztstrf StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -96,9 +96,9 @@ void INSERT_TASK_ztstrf( const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &ib, sizeof(int), STARPU_VALUE, &nb, sizeof(int), - STARPU_RW, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un), - STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(L, CHAMELEON_Complex64_t, Lm, Ln), + STARPU_RW, RTBLKADDR(U, ChamComplexDouble, Um, Un), + STARPU_RW, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(L, ChamComplexDouble, Lm, Ln), STARPU_VALUE, &IPIV, sizeof(int*), STARPU_SCRATCH, options->ws_worker, STARPU_VALUE, &d_work, sizeof(CHAMELEON_starpu_ws_t *), diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 229beb87ec96132d3ab5cf2a318e531e7e8c0c08..db6b3726ee0375fa0055c64d7f61067989c438f9 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -11,7 +11,7 @@ * * @brief Chameleon zunmlq StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -23,7 +23,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -118,9 +118,9 @@ void INSERT_TASK_zunmlq( const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_RW, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), + STARPU_RW, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* ib * nb */ STARPU_SCRATCH, options->ws_worker, STARPU_VALUE, &nb, sizeof(int), diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 767d741b907d4aeb587484a5a54a4fc280e34f4a..657c6ca87230410ac7ed7fe6e72435ed31fec886 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -11,7 +11,7 @@ * * @brief Chameleon zunmqr StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @comment This file has been automatically generated * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Hatem Ltaief @@ -22,7 +22,7 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -117,9 +117,9 @@ void INSERT_TASK_zunmqr( const RUNTIME_option_t *options, STARPU_VALUE, &n, sizeof(int), STARPU_VALUE, &k, sizeof(int), STARPU_VALUE, &ib, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn), - STARPU_RW, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_R, RTBLKADDR(T, ChamComplexDouble, Tm, Tn), + STARPU_RW, RTBLKADDR(C, ChamComplexDouble, Cm, Cn), /* ib * nb */ STARPU_SCRATCH, options->ws_worker, STARPU_VALUE, &nb, sizeof(int), diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 36dda081062b9840531622c0c46e4de29a1eaaf1..ee4817fe4cc35abf5a73baffad8b224fcf79452a 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -11,7 +11,7 @@ * * @brief Chameleon StarPU descriptor routines * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede @@ -20,7 +20,7 @@ * @author Raphael Boucherie * @author Samuel Thibault * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-06 * */ #include "chameleon_starpu.h" @@ -101,6 +101,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) { int64_t lmt = desc->lmt; int64_t lnt = desc->lnt; + size_t nbtiles = lmt * lnt; desc->occurences = 1; @@ -108,8 +109,12 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) * Allocate starpu_handle_t array (handlers are initialized on the fly when * discovered by any algorithm to save space) */ - desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t)); - assert(desc->schedopt); + if ( cham_is_mixed( desc->dtyp ) ) { + nbtiles *= 3; + } + + desc->schedopt = (void*)calloc( nbtiles, sizeof(starpu_data_handle_t) ); + assert( desc->schedopt ); #if !defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) @@ -160,7 +165,7 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) */ { chameleon_starpu_tag_init(); - desc->mpitag = chameleon_starpu_tag_book( (int64_t)lnt * (int64_t)lmt ); + desc->mpitag = chameleon_starpu_tag_book( nbtiles ); if ( desc->mpitag == -1 ) { chameleon_fatal_error("RUNTIME_desc_create", "Can't pursue computation since no more tags are available"); @@ -181,41 +186,45 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc ) * If this is the last descriptor using the matrix, we release the handle * and unregister the GPU data */ - if ( desc->occurences == 0 ) { - starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); - int lmt = desc->lmt; - int lnt = desc->lnt; - int m, n; - - for (n = 0; n < lnt; n++) { - for (m = 0; m < lmt; m++) - { - if ( *handle != NULL ) { - starpu_data_unregister(*handle); - *handle = NULL; - } - handle++; - } + if ( desc->occurences > 0 ) { + return; + } + + starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt); + int64_t lmt = desc->lmt; + int64_t lnt = desc->lnt; + int64_t nbtiles = lmt * lnt; + int64_t m; + + if ( cham_is_mixed( desc->dtyp ) ) { + nbtiles *= 3; + } + + for (m = 0; m < nbtiles; m++, handle++) + { + if ( *handle != NULL ) { + starpu_data_unregister(*handle); + *handle = NULL; } + } #if !defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) - if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) + { + /* Unmap the pinned memory associated to the matrix */ + if (gpuHostUnregister(desc->mat) != gpuSuccess) { - /* Unmap the pinned memory associated to the matrix */ - if (gpuHostUnregister(desc->mat) != gpuSuccess) - { - chameleon_warning("RUNTIME_desc_destroy(StarPU)", - "gpuHostUnregister failed to unregister the " - "pinned memory associated to the matrix"); - } + chameleon_warning("RUNTIME_desc_destroy(StarPU)", + "gpuHostUnregister failed to unregister the " + "pinned memory associated to the matrix"); } + } #endif #endif - chameleon_starpu_tag_release( desc->mpitag ); + chameleon_starpu_tag_release( desc->mpitag ); - free( desc->schedopt ); - } + free( desc->schedopt ); } /** @@ -335,24 +344,37 @@ void RUNTIME_desc_flush( const CHAM_desc_t *desc, void RUNTIME_data_flush( const RUNTIME_sequence_t *sequence, const CHAM_desc_t *A, int m, int n ) { + int local, i, imax = 1; int64_t mm = m + (A->i / A->mb); int64_t nn = n + (A->j / A->nb); - int64_t shift = ((int64_t)A->lmt) * nn + mm; + int64_t shift = ((int64_t)A->lmt) * nn + mm; + int64_t nbtiles = ((int64_t)(A->lmt)) * ((int64_t)(A->lnt)); starpu_data_handle_t *handle = A->schedopt; handle += shift; - if (*handle == NULL) { - return; - } + local = chameleon_desc_islocal( A, m, n ); + + if ( cham_is_mixed( A->dtyp ) ) { + imax = 3; + } + + for( i=0; i<imax; i++ ) { + starpu_data_handle_t *handlebis; + + handlebis = handle + i * nbtiles; + + if ( *handlebis == NULL ) { + continue; + } #if defined(CHAMELEON_USE_MPI) - starpu_mpi_cache_flush( MPI_COMM_WORLD, *handle ); + starpu_mpi_cache_flush( MPI_COMM_WORLD, *handlebis ); #endif - if ( chameleon_desc_islocal( A, m, n ) ) { - chameleon_starpu_data_wont_use( *handle ); + if ( local ) { + chameleon_starpu_data_wont_use( *handlebis ); + } } - (void)sequence; } @@ -443,3 +465,71 @@ void *RUNTIME_data_getaddr( const CHAM_desc_t *A, int m, int n ) assert( *ptrtile ); return (void*)(*ptrtile); } + +void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options, + cham_access_t access, cham_flttype_t flttype, + const CHAM_desc_t *A, int m, int n ) +{ + int64_t mm = m + (A->i / A->mb); + int64_t nn = n + (A->j / A->nb); + + CHAM_tile_t *tile = A->get_blktile( A, m, n ); + starpu_data_handle_t *ptrtile = A->schedopt; + + int fltshift = (cham_get_arith( tile->flttype ) - cham_get_arith( flttype ) + 3 ) % 3; + int64_t shift = (int64_t)fltshift * ((int64_t)A->lmt * (int64_t)A->lnt); + shift = shift + ((int64_t)A->lmt) * nn + mm; + + /* Get the correct starpu_handle */ + ptrtile += shift; + + if ( *ptrtile != NULL ) { + return (void*)(*ptrtile); + } + + int home_node = -1; + int myrank = A->myrank; + int owner = A->get_rankof( A, m, n ); + + if ( myrank == owner ) { + if ( (tile->format & CHAMELEON_TILE_HMAT) || + (tile->mat != NULL) ) + { + home_node = STARPU_MAIN_RAM; + } + } + + starpu_cham_tile_register( ptrtile, home_node, tile, flttype ); + +#if defined(HAVE_STARPU_DATA_SET_OOC_FLAG) + if ( A->ooc == 0 ) { + starpu_data_set_ooc_flag( *ptrtile, 0 ); + } +#endif + +#if defined(HAVE_STARPU_DATA_SET_COORDINATES) + starpu_data_set_coordinates( *ptrtile, 3, m, n, cham_get_arith( flttype ) ); +#endif + +#if defined(CHAMELEON_USE_MPI) + starpu_mpi_data_register( *ptrtile, A->mpitag + shift, owner ); +#endif /* defined(CHAMELEON_USE_MPI) */ + +#if defined(CHAMELEON_KERNELS_TRACE) + fprintf( stderr, "%s - %p registered with tag %ld\n", + tile->name, *ptrtile, A->mpitag + shift ); +#endif + assert( *ptrtile ); + + /* Submit the data conversion */ + if (( fltshift != 0 ) && (access & ChamR) && (owner == myrank) ) { + starpu_data_handle_t *fromtile = A->schedopt; + starpu_data_handle_t *totile = ptrtile; + + fromtile += ((int64_t)A->lmt) * nn + mm; + if ( *fromtile != NULL ) { + insert_task_convert( options, tile->m, tile->n, tile->flttype, *fromtile, flttype, *totile ); + } + } + return (void*)(*ptrtile); +} diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 8e77d14a9dea84153c6b0672d63cecb3c71ed077..8d421ddbb5234d073ba9ea71b3c97ff3046ff9fc 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -116,6 +116,10 @@ static inline int cham_to_starpu_access( cham_access_t accessA ) { return accessA; } +void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options, + cham_access_t access, cham_flttype_t flttype, + const CHAM_desc_t *A, int m, int n ); + /* * MPI Redefinitions */ diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 14f7c4e20c4b25ce01491f4274a25c721cc122ee..db7d35daca01a8b1b7fc16abf17fb742dda01e65 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -11,14 +11,14 @@ * * @brief Chameleon StarPU CHAMELEON_Complex64_t codelets header * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Alycia Lisito * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -33,14 +33,6 @@ #include "coreblas/coreblas_z.h" #include "coreblas/coreblas_ztile.h" -#if defined(CHAMELEON_USE_CUDA) -#include "gpucublas.h" -#endif - -#if defined(CHAMELEON_USE_HIP) -#include "gpuhipblas.h" -#endif - #endif /* !defined(CHAMELEON_SIMULATION) */ /* @@ -98,6 +90,10 @@ CODELETS_HEADER(zunmqr); * Auxiliary functions */ CODELETS_HEADER(zgeadd); +#if defined(PRECISION_z) || defined(PRECISION_d) +CODELETS_HEADER(zgered); +#endif +CODELETS_HEADER(zhessq); CODELETS_HEADER(zhe2ge); CODELETS_HEADER(zlascal); CODELETS_HEADER(ztradd); @@ -142,4 +138,19 @@ CODELETS_HEADER(zsytrf_nopiv); #endif CODELETS_HEADER(zplgsy); +#if defined(PRECISION_d) || defined(PRECISION_s) +CODELETS_HEADER(dlag2h); +CODELETS_HEADER(hlag2d); +#endif + +struct cl_zgemm_args_s { + cham_trans_t transA; + cham_trans_t transB; + int m; + int n; + int k; + CHAMELEON_Complex64_t alpha; + CHAMELEON_Complex64_t beta; +}; + #endif /* _runtime_codelet_z_h_ */ diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index 00808fc52fec20078744d3603bec4cb2be680a24..c27d6b913bb231c4815dca09e67b7201e12697c7 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -11,13 +11,13 @@ * * @brief Chameleon StarPU codelets main header * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-06 * */ #ifndef _runtime_codelets_h_ @@ -26,6 +26,16 @@ #include "chameleon/config.h" #include "runtime_codelet_profile.h" +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +#include "gpucublas.h" +#endif + +#if defined(CHAMELEON_USE_HIP) +#include "gpuhipblas.h" +#endif +#endif /* !defined(CHAMELEON_SIMULATION) */ + #if defined(STARPU_CUDA_ASYNC) #define CODELET_CUDA_FLAGS(flags) .cuda_flags = {(flags)}, #else @@ -141,5 +151,25 @@ #endif CODELETS_HEADER(map); +CODELETS_HEADER(hgemm); +CODELETS_HEADER(gemmex); + +struct cl_hgemm_args_s { + cham_trans_t transA; + cham_trans_t transB; + int m; + int n; + int k; + CHAMELEON_Real16_t alpha; + CHAMELEON_Real16_t beta; +}; + +void +insert_task_convert( const RUNTIME_option_t *options, + int m, int n, + cham_flttype_t fromtype, + starpu_data_handle_t fromtile, + cham_flttype_t totype, + starpu_data_handle_t totile ); #endif /* _runtime_codelets_h_ */ diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index 27eb801cb266a0038e77b19e04e729b68b3270f4..1e837e1b0ca4c4a5e8418e9acf86dcfb71029851 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -672,6 +672,8 @@ starpu_cham_tile_register( starpu_data_handle_t *handleptr, .tilesize = tile->m * tile->n * elemsize, }; memcpy( &(cham_tile_interface.tile), tile, sizeof( CHAM_tile_t ) ); + /* Overwrite the flttype in case it comes from a data conversion */ + cham_tile_interface.tile.flttype = flttype; if ( tile->format & CHAMELEON_TILE_FULLRANK ) { cham_tile_interface.allocsize = tile->m * tile->n * elemsize;