diff --git a/compute/pzgemm.c b/compute/pzgemm.c index 4c41e64438be82566417209028177a56ac26950a..6d98e855dd3be370cbcbb014fdc46962b3a84180 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -24,15 +24,15 @@ */ #include "control/common.h" -#define A(m, n) A, m, n -#define B(m, n) B, m, n -#define C(m, n) C, m, n -#define WA(m, n) WA, m, n -#define WB(m, n) WB, m, n +#define A( _m_, _n_ ) A, (_m_), (_n_) +#define B( _m_, _n_ ) B, (_m_), (_n_) +#define C( _m_, _n_ ) C, (_m_), (_n_) +#define WA( _m_, _n_ ) WA, (_m_), (_n_) +#define WB( _m_, _n_ ) WB, (_m_), (_n_) /** * Parallel tile matrix-matrix multiplication. - * Generic algorithm for any data distribution. + * Generic algorithm for any data distribution with a stationnary A. */ static inline void chameleon_pzgemm_Astat( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB, diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c index b50edf517487e469b41f59d3113408d631cea0f6..4d8126f5c5150febf6cd7bc8f5d161f1a92f8010 100644 --- a/compute/pzgepdf_qdwh.c +++ b/compute/pzgepdf_qdwh.c @@ -818,7 +818,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t switch( mtxtype ) { #if defined(PRECISION_z) || defined(PRECISION_c) case ChamHermitian: - chameleon_pzhemm( ChamRight, ChamUpper, + chameleon_pzhemm( gemm_ws, ChamRight, ChamUpper, 1., descU, &descA, 0., descH, sequence, request ); if ( info ) { @@ -827,7 +827,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t break; #endif case ChamSymmetric: - chameleon_pzsymm( ChamRight, ChamUpper, + chameleon_pzsymm( gemm_ws, ChamRight, ChamUpper, 1., descU, &descA, 0., descH, sequence, request ); if ( info ) { diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 75f1ab66be514b1eaf22f0f47efd8321cbcebc31..24f8bb2be7ce21c1db45d8582c0ef2d1572d3930 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -30,6 +30,257 @@ #define WA( _m_, _n_ ) WA, (_m_), (_n_) #define WB( _m_, _n_ ) WB, (_m_), (_n_) +/** + * Parallel tile matrix-matrix multiplication. + * Generic algorithm for any data distribution with a stationnary A. + * + * Assuming A has been setup with a proper getrank function to account for symmetry + */ +static inline void +chameleon_pzhemm_Astat( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + RUNTIME_sequence_t *sequence = options->sequence; + int k, m, n, l, Am, An; + int tempmm, tempnn, tempkn, tempkm; + int myrank = RUNTIME_comm_rank( chamctxt ); + int reduceC[ C->mt * C->nt ]; + + /* Set C tiles to redux mode */ + for (n = 0; n < C->nt; n++) { + for (m = 0; m < C->mt; m++) { + reduceC[ n * C->mt + m ] = 0; + + /* The node owns the C tile. */ + if ( C->get_rankof( C(m, n) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + continue; + } + + /* + * The node owns the A tile that will define the locality of the + * computations. + */ + /* Select row or column based on side */ + l = ( side == ChamLeft ) ? m : n; + + if ( uplo == ChamLower ) { + for (k = 0; k < A->mt; k++) { + Am = k; + An = k; + + if (k < l) { + Am = l; + } + else if (k > l) { + An = l; + } + + if ( A->get_rankof( A( Am, An ) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + else { + for (k = 0; k < A->mt; k++) { + Am = k; + An = k; + + if (k < l) { + An = l; + } + else if (k > l) { + Am = l; + } + + if ( A->get_rankof( A( Am, An ) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + } + } + + for(n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for(m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + + /* Scale C */ + options->forcesub = 0; + INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb, + beta, C, m, n ); + options->forcesub = reduceC[ n * C->mt + m ]; + + /* + * ChamLeft / ChamLower + */ + /* Select row or column based on side */ + l = ( side == ChamLeft ) ? m : n; + + if (side == ChamLeft) { + if (uplo == ChamLower) { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + + if (k < m) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == m) { + INSERT_TASK_zhemm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + /* + * ChamLeft / ChamUpper + */ + else { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + + if (k < m) { + INSERT_TASK_zgemm_Astat( + options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == m) { + INSERT_TASK_zhemm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + } + /* + * ChamRight / ChamLower + */ + else { + if (uplo == ChamLower) { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + + if (k < n) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == n) { + INSERT_TASK_zhemm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + /* + * ChamRight / ChamUpper + */ + else { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + + if (k < n) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == n) { + INSERT_TASK_zhemm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + } + + RUNTIME_zgersum_submit_tree( options, C(m, n) ); + RUNTIME_data_flush( sequence, C(m, n) ); + } + } + options->forcesub = 0; + (void)chamctxt; +} + + /** * Parallel tile hermitian matrix-matrix multiplication. * SUMMA algorithm for 2D block-cyclic data distribution. @@ -310,39 +561,22 @@ static inline void chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - CHAM_desc_t WA, WB; - int lookahead; - - lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - NULL, NULL, NULL ); if (side == ChamLeft) { chameleon_pzhemm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } else { chameleon_pzhemm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } - RUNTIME_desc_flush( &WA, sequence ); - RUNTIME_desc_flush( &WB, sequence ); - RUNTIME_desc_flush( C, sequence ); - chameleon_sequence_wait( chamctxt, sequence ); - chameleon_desc_destroy( &WA ); - chameleon_desc_destroy( &WB ); + CHAMELEON_Desc_Flush( WA, sequence ); + CHAMELEON_Desc_Flush( WB, sequence ); } /** @@ -530,13 +764,15 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * Parallel tile hermitian matrix-matrix multiplication. wrapper. */ void -chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, +chameleon_pzhemm( struct chameleon_pzgemm_s *ws, + cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -544,15 +780,26 @@ chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - if ( ((C->p > 1) || (C->q > 1)) && - (C->get_rankof == chameleon_getrankof_2d) && - (chamctxt->generic_enabled != CHAMELEON_TRUE) ) - { - chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); - } - else - { + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */ + case ChamGemmAlgGeneric: chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: + chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, + &(ws->WA), &(ws->WB), &options ); + break; + + case ChamGemmAlgSummaA: + if ( side == ChamLeft ) { + chameleon_pzhemm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + break; } RUNTIME_options_finalize( &options, chamctxt ); diff --git a/compute/pzsymm.c b/compute/pzsymm.c index 2f2fa1997472bc2fc56002b5bc7cf1df12d48676..e39d9f6e298a42be3b8516121cbed6c14afaee3c 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -30,6 +30,257 @@ #define WA( _m_, _n_ ) WA, (_m_), (_n_) #define WB( _m_, _n_ ) WB, (_m_), (_n_) +/** + * Parallel tile matrix-matrix multiplication. + * Generic algorithm for any data distribution with a stationnary A. + * + * Assuming A has been setup with a proper getrank function to account for symmetry + */ +static inline void +chameleon_pzsymm_Astat( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + RUNTIME_sequence_t *sequence = options->sequence; + int k, m, n, l, Am, An; + int tempmm, tempnn, tempkn, tempkm; + int myrank = RUNTIME_comm_rank( chamctxt ); + int reduceC[ C->mt * C->nt ]; + + /* Set C tiles to redux mode */ + for (n = 0; n < C->nt; n++) { + for (m = 0; m < C->mt; m++) { + reduceC[ n * C->mt + m ] = 0; + + /* The node owns the C tile. */ + if ( C->get_rankof( C(m, n) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + continue; + } + + /* + * The node owns the A tile that will define the locality of the + * computations. + */ + /* Select row or column based on side */ + l = ( side == ChamLeft ) ? m : n; + + if ( uplo == ChamLower ) { + for (k = 0; k < A->mt; k++) { + Am = k; + An = k; + + if (k < l) { + Am = l; + } + else if (k > l) { + An = l; + } + + if ( A->get_rankof( A( Am, An ) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + else { + for (k = 0; k < A->mt; k++) { + Am = k; + An = k; + + if (k < l) { + An = l; + } + else if (k > l) { + Am = l; + } + + if ( A->get_rankof( A( Am, An ) ) == myrank ) { + reduceC[ n * C->mt + m ] = 1; + RUNTIME_zgersum_set_methods( C(m, n) ); + break; + } + } + } + } + } + + for(n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + for(m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + + /* Scale C */ + options->forcesub = 0; + INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb, + beta, C, m, n ); + options->forcesub = reduceC[ n * C->mt + m ]; + + /* + * ChamLeft / ChamLower + */ + /* Select row or column based on side */ + l = ( side == ChamLeft ) ? m : n; + + if (side == ChamLeft) { + if (uplo == ChamLower) { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + + if (k < m) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == m) { + INSERT_TASK_zsymm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + /* + * ChamLeft / ChamUpper + */ + else { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + + if (k < m) { + INSERT_TASK_zgemm_Astat( + options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), /* ldak * X */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == m) { + INSERT_TASK_zsymm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(m, k), /* lda * K */ + B(k, n), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + } + /* + * ChamRight / ChamLower + */ + else { + if (uplo == ChamLower) { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + + if (k < n) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == n) { + INSERT_TASK_zsymm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + /* + * ChamRight / ChamUpper + */ + else { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + + if (k < n) { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamNoTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(k, n), /* ldak * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else if (k == n) { + INSERT_TASK_zsymm_Astat( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), /* ldak * Y */ + B(m, k), /* ldb * Y */ + zone, C(m, n)); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm_Astat( + options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), /* ldb * K */ + A(n, k), /* lda * K */ + zone, C(m, n)); /* ldc * Y */ + } + } + } + } + + RUNTIME_zgersum_submit_tree( options, C(m, n) ); + RUNTIME_data_flush( sequence, C(m, n) ); + } + } + options->forcesub = 0; + (void)chamctxt; +} + + /** * Parallel tile symmetric matrix-matrix multiplication. * SUMMA algorithm for 2D block-cyclic data distribution. @@ -310,39 +561,22 @@ static inline void chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - CHAM_desc_t WA, WB; - int lookahead; - - lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - NULL, NULL, NULL ); if (side == ChamLeft) { chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } else { chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, - &WA, &WB, options ); + WA, WB, options ); } - RUNTIME_desc_flush( &WA, sequence ); - RUNTIME_desc_flush( &WB, sequence ); - RUNTIME_desc_flush( C, sequence ); - chameleon_sequence_wait( chamctxt, sequence ); - chameleon_desc_destroy( &WA ); - chameleon_desc_destroy( &WB ); + CHAMELEON_Desc_Flush( WA, sequence ); + CHAMELEON_Desc_Flush( WB, sequence ); } /** @@ -530,13 +764,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_ * Parallel tile symmetric matrix-matrix multiplication. wrapper. */ void -chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, +chameleon_pzsymm( struct chameleon_pzgemm_s *ws, + cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; + cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric; chamctxt = chameleon_context_self(); if (sequence->status != CHAMELEON_SUCCESS) { @@ -544,15 +780,26 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, } RUNTIME_options_init( &options, chamctxt, sequence, request ); - if ( ((C->p > 1) || (C->q > 1)) && - (C->get_rankof == chameleon_getrankof_2d) && - (chamctxt->generic_enabled != CHAMELEON_TRUE) ) - { - chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); - } - else - { + switch( alg ) { + case ChamGemmAlgAuto: + case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */ + case ChamGemmAlgGeneric: chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + break; + + case ChamGemmAlgSummaC: + chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, + &(ws->WA), &(ws->WB), &options ); + break; + + case ChamGemmAlgSummaA: + if ( side == ChamLeft ) { + chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + break; } RUNTIME_options_finalize( &options, chamctxt ); diff --git a/compute/zgemm.c b/compute/zgemm.c index 325f27cd90dc3cbf8dee13f26020f613e98a501a..9ce47be609ea6784527cba041a1cefbead3409e7 100644 --- a/compute/zgemm.c +++ b/compute/zgemm.c @@ -90,8 +90,8 @@ */ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA __attribute__((unused)), cham_trans_t transB __attribute__((unused)), - const CHAM_desc_t *A __attribute__((unused)), - const CHAM_desc_t *B __attribute__((unused)), + const CHAM_desc_t *A, + const CHAM_desc_t *B, const CHAM_desc_t *C ) { CHAM_context_t *chamctxt; @@ -301,7 +301,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws ) */ int CHAMELEON_zgemm( cham_trans_t transA, cham_trans_t transB, int M, int N, int K, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, - CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t *B, int LDB, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) { int NB; @@ -496,7 +496,6 @@ int CHAMELEON_zgemm_Tile( cham_trans_t transA, cham_trans_t transB, CHAMELEON_Desc_Flush( C, sequence ); chameleon_sequence_wait( chamctxt, sequence ); - CHAMELEON_zgemm_WS_Free( ws ); status = sequence->status; diff --git a/compute/zhemm.c b/compute/zhemm.c index 422bf9eecef6ea09acdff520e99751f5560b7e01..23c7fdf985ab84b2e604ca5dec633556ff35e68c 100644 --- a/compute/zhemm.c +++ b/compute/zhemm.c @@ -24,6 +24,181 @@ */ #include "control/common.h" +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Allocate the required workspaces for asynchronous hemm + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the hermitian matrix A appears on the + * left or right in the operation as follows: + * = ChamLeft: \f[ C = \alpha \times A \times B + \beta \times C \f] + * = ChamRight: \f[ C = \alpha \times B \times A + \beta \times C \f] + * + * @param[in] uplo + * Specifies whether the upper or lower triangular part of + * the hermitian matrix A is to be referenced as follows: + * = ChamLower: Only the lower triangular part of the + * hermitian matrix A is to be referenced. + * = ChamUpper: Only the upper triangular part of the + * hermitian matrix A is to be referenced. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] B + * The descriptor of the matrix B. + * + * @param[in] C + * The descriptor of the matrix C. + * + ******************************************************************************* + * + * @retval An allocated opaque pointer to use in CHAMELEON_zhemm_Tile_Async() + * and to free with CHAMELEON_zhemm_WS_Free(). + * + ******************************************************************************* + * + * @sa CHAMELEON_zhemm_Tile_Async + * @sa CHAMELEON_zhemm_WS_Free + * + */ +void *CHAMELEON_zhemm_WS_Alloc( cham_side_t side __attribute__((unused)), + cham_uplo_t uplo __attribute__((unused)), + const CHAM_desc_t *A, + const CHAM_desc_t *B, + const CHAM_desc_t *C ) +{ + CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *options; + + chamctxt = chameleon_context_self(); + if ( chamctxt == NULL ) { + return NULL; + } + + options = calloc( 1, sizeof(struct chameleon_pzgemm_s) ); + options->alg = ChamGemmAlgAuto; + + /* + * If only one process, or if generic has been globally enforced, we switch + * to generic immediately. + */ + if ( ((C->p == 1) && (C->q == 1)) || + (chamctxt->generic_enabled == CHAMELEON_TRUE) ) + { + options->alg = ChamGemmAlgGeneric; + } + + /* Look at environment variable is something enforces the variant. */ + if ( options->alg == ChamGemmAlgAuto ) + { + char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" ); + + if ( algostr ) { + if ( strcasecmp( algostr, "summa_c" ) == 0 ) { + options->alg = ChamGemmAlgSummaC; + } + else if ( strcasecmp( algostr, "summa_a" ) == 0 ) { + options->alg = ChamGemmAlgSummaA; + } + else if ( strcasecmp( algostr, "summa_b" ) == 0 ) { + options->alg = ChamGemmAlgSummaB; + } + else if ( strcasecmp( algostr, "generic" ) == 0 ) { + options->alg = ChamGemmAlgGeneric; + } + else if ( strcasecmp( algostr, "auto" ) == 0 ) { + options->alg = ChamGemmAlgAuto; + } + else { + fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" ); + } + } + chameleon_cleanenv( algostr ); + } + + /* Perform automatic choice if not already enforced. */ + if ( options->alg == ChamGemmAlgAuto ) + { + double sizeA, sizeB, sizeC; + double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */ + + /* Compute the average array per node for each matrix */ + sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q); + sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q); + sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio; + + if ( (sizeC > sizeA) && (sizeC > sizeB) ) { + options->alg = ChamGemmAlgSummaC; + } + else { + if ( sizeA > sizeB ) { + options->alg = ChamGemmAlgSummaA; + } + else { + options->alg = ChamGemmAlgSummaB; + } + } + } + + assert( options->alg != ChamGemmAlgAuto ); + + /* Now that we have decided which algorithm, let's allocate the required data structures. */ + if ( (options->alg == ChamGemmAlgSummaC ) && + (C->get_rankof == chameleon_getrankof_2d ) ) + { + int lookahead = chamctxt->lookahead; + + chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + } + + return (void*)options; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Free the allocated workspaces for asynchronous hemm + * + ******************************************************************************* + * + * @param[in,out] user_ws + * On entry, the opaque pointer allocated by CHAMELEON_zhemm_WS_Alloc() + * On exit, all data are freed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zhemm_Tile_Async + * @sa CHAMELEON_zhemm_WS_Alloc + * + */ +void CHAMELEON_zhemm_WS_Free( void *user_ws ) +{ + struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws; + + if ( ws->alg == ChamGemmAlgSummaC ) { + chameleon_desc_destroy( &(ws->WA) ); + chameleon_desc_destroy( &(ws->WB) ); + } + free( ws ); +} + /** ******************************************************************************** * @@ -102,9 +277,9 @@ * */ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, - CHAMELEON_Complex64_t *B, int LDB, - CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) { int NB; int Am; @@ -115,6 +290,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -158,7 +334,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, ((alpha == (CHAMELEON_Complex64_t)0.0) && beta == (CHAMELEON_Complex64_t)1.0)) return CHAMELEON_SUCCESS; - /* Tune NB depending on M, N & NRHS; Set NBNBSIZE */ + /* Tune NB depending on M, N & NRHS; Set NBNB */ status = chameleon_tune(CHAMELEON_FUNC_ZHEMM, M, N, 0); if (status != CHAMELEON_SUCCESS) { chameleon_error("CHAMELEON_zhemm", "chameleon_tune() failed"); @@ -179,7 +355,8 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, C, NB, NB, LDC, N, M, N, sequence, &request ); /* Call the tile interface */ - CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request ); + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt ); + CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, @@ -192,6 +369,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ + CHAMELEON_zhemm_WS_Free( ws ); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt ); chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt ); @@ -260,13 +438,14 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N, * */ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; int status; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -275,13 +454,16 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, } chameleon_sequence_create( chamctxt, &sequence ); - CHAMELEON_zhemm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request ); + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C ); + CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( B, sequence ); CHAMELEON_Desc_Flush( C, sequence ); chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zhemm_WS_Free( ws ); + status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); return status; @@ -316,11 +498,13 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo, * */ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + void *user_ws, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -391,16 +575,6 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, } /* Check submatrix starting point */ - /* if ( (B->i != C->i) || (B->j != C->j) ) { */ - /* chameleon_error("CHAMELEON_zhemm_Tile_Async", "B and C submatrices doesn't match"); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ - /* if ( (A->i != A->j) || */ - /* ( (side == ChamLeft) && (A->i != B->i ) ) || */ - /* ( (side == ChamRight) && (A->i != B->j ) ) ) { */ - /* chameleon_error("CHAMELEON_zhemm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ if( (A->i != 0) || (A->j != 0) || (B->i != 0) || (B->j != 0) || (C->i != 0) || (C->j != 0) ) { @@ -415,7 +589,21 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo, return CHAMELEON_SUCCESS; } - chameleon_pzhemm( side, uplo, alpha, A, B, beta, C, sequence, request ); + if ( user_ws == NULL ) { + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C ); + } + else { + ws = user_ws; + } + chameleon_pzhemm( ws, side, uplo, alpha, A, B, beta, C, sequence, request ); + + if ( user_ws == NULL ) { + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zhemm_WS_Free( ws ); + } return CHAMELEON_SUCCESS; } diff --git a/compute/zsymm.c b/compute/zsymm.c index c4ff5fb0ea3f8cb1bf8bb0321f2f62f52d0d33bb..397c8f65c7f4387cc4a3fd106538a2fa58ab3325 100644 --- a/compute/zsymm.c +++ b/compute/zsymm.c @@ -24,6 +24,181 @@ */ #include "control/common.h" +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Allocate the required workspaces for asynchronous symm + * + ******************************************************************************* + * + * @param[in] side + * Specifies whether the symmetric matrix A appears on the + * left or right in the operation as follows: + * = ChamLeft: \f[ C = \alpha \times A \times B + \beta \times C \f] + * = ChamRight: \f[ C = \alpha \times B \times A + \beta \times C \f] + * + * @param[in] uplo + * Specifies whether the upper or lower triangular part of + * the symmetric matrix A is to be referenced as follows: + * = ChamLower: Only the lower triangular part of the + * symmetric matrix A is to be referenced. + * = ChamUpper: Only the upper triangular part of the + * symmetric matrix A is to be referenced. + * + * @param[in] A + * The descriptor of the matrix A. + * + * @param[in] B + * The descriptor of the matrix B. + * + * @param[in] C + * The descriptor of the matrix C. + * + ******************************************************************************* + * + * @retval An allocated opaque pointer to use in CHAMELEON_zsymm_Tile_Async() + * and to free with CHAMELEON_zsymm_WS_Free(). + * + ******************************************************************************* + * + * @sa CHAMELEON_zsymm_Tile_Async + * @sa CHAMELEON_zsymm_WS_Free + * + */ +void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side __attribute__((unused)), + cham_uplo_t uplo __attribute__((unused)), + const CHAM_desc_t *A, + const CHAM_desc_t *B, + const CHAM_desc_t *C ) +{ + CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *options; + + chamctxt = chameleon_context_self(); + if ( chamctxt == NULL ) { + return NULL; + } + + options = calloc( 1, sizeof(struct chameleon_pzgemm_s) ); + options->alg = ChamGemmAlgAuto; + + /* + * If only one process, or if generic has been globally enforced, we switch + * to generic immediately. + */ + if ( ((C->p == 1) && (C->q == 1)) || + (chamctxt->generic_enabled == CHAMELEON_TRUE) ) + { + options->alg = ChamGemmAlgGeneric; + } + + /* Look at environment variable is something enforces the variant. */ + if ( options->alg == ChamGemmAlgAuto ) + { + char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" ); + + if ( algostr ) { + if ( strcasecmp( algostr, "summa_c" ) == 0 ) { + options->alg = ChamGemmAlgSummaC; + } + else if ( strcasecmp( algostr, "summa_a" ) == 0 ) { + options->alg = ChamGemmAlgSummaA; + } + else if ( strcasecmp( algostr, "summa_b" ) == 0 ) { + options->alg = ChamGemmAlgSummaB; + } + else if ( strcasecmp( algostr, "generic" ) == 0 ) { + options->alg = ChamGemmAlgGeneric; + } + else if ( strcasecmp( algostr, "auto" ) == 0 ) { + options->alg = ChamGemmAlgAuto; + } + else { + fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" ); + } + } + chameleon_cleanenv( algostr ); + } + + /* Perform automatic choice if not already enforced. */ + if ( options->alg == ChamGemmAlgAuto ) + { + double sizeA, sizeB, sizeC; + double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */ + + /* Compute the average array per node for each matrix */ + sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q); + sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q); + sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio; + + if ( (sizeC > sizeA) && (sizeC > sizeB) ) { + options->alg = ChamGemmAlgSummaC; + } + else { + if ( sizeA > sizeB ) { + options->alg = ChamGemmAlgSummaA; + } + else { + options->alg = ChamGemmAlgSummaB; + } + } + } + + assert( options->alg != ChamGemmAlgAuto ); + + /* Now that we have decided which algorithm, let's allocate the required data structures. */ + if ( (options->alg == ChamGemmAlgSummaC ) && + (C->get_rankof == chameleon_getrankof_2d ) ) + { + int lookahead = chamctxt->lookahead; + + chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + } + + return (void*)options; +} + +/** + ******************************************************************************** + * + * @ingroup CHAMELEON_Complex64_t + * + * @brief Free the allocated workspaces for asynchronous symm + * + ******************************************************************************* + * + * @param[in,out] user_ws + * On entry, the opaque pointer allocated by CHAMELEON_zsymm_WS_Alloc() + * On exit, all data are freed. + * + ******************************************************************************* + * + * @sa CHAMELEON_zsymm_Tile_Async + * @sa CHAMELEON_zsymm_WS_Alloc + * + */ +void CHAMELEON_zsymm_WS_Free( void *user_ws ) +{ + struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws; + + if ( ws->alg == ChamGemmAlgSummaC ) { + chameleon_desc_destroy( &(ws->WA) ); + chameleon_desc_destroy( &(ws->WB) ); + } + free( ws ); +} + /** ******************************************************************************** * @@ -102,9 +277,9 @@ * */ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, - CHAMELEON_Complex64_t *B, int LDB, - CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA, + CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *C, int LDC ) { int NB; int Am; @@ -115,6 +290,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -179,7 +355,8 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, C, NB, NB, LDC, N, M, N, sequence, &request ); /* Call the tile interface */ - CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request ); + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt ); + CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request ); /* Submit the matrix conversion back */ chameleon_ztile2lap( chamctxt, &descAl, &descAt, @@ -192,6 +369,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, chameleon_sequence_wait( chamctxt, sequence ); /* Cleanup the temporary data */ + CHAMELEON_zsymm_WS_Free( ws ); chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt ); chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt ); chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt ); @@ -260,13 +438,14 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N, * */ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER; int status; + void *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -275,13 +454,16 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, } chameleon_sequence_create( chamctxt, &sequence ); - CHAMELEON_zsymm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request ); + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C ); + CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request ); CHAMELEON_Desc_Flush( A, sequence ); CHAMELEON_Desc_Flush( B, sequence ); CHAMELEON_Desc_Flush( C, sequence ); chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zsymm_WS_Free( ws ); + status = sequence->status; chameleon_sequence_destroy( chamctxt, sequence ); return status; @@ -316,11 +498,13 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo, * */ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + void *user_ws, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; + struct chameleon_pzgemm_s *ws; chamctxt = chameleon_context_self(); if (chamctxt == NULL) { @@ -391,16 +575,6 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, } /* Check submatrix starting point */ - /* if ( (B->i != C->i) || (B->j != C->j) ) { */ - /* chameleon_error("CHAMELEON_zsymm_Tile_Async", "B and C submatrices doesn't match"); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ - /* if ( (A->i != A->j) || */ - /* ( (side == ChamLeft) && (A->i != B->i ) ) || */ - /* ( (side == ChamRight) && (A->i != B->j ) ) ) { */ - /* chameleon_error("CHAMELEON_zsymm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */ - /* return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */ - /* } */ if( (A->i != 0) || (A->j != 0) || (B->i != 0) || (B->j != 0) || (C->i != 0) || (C->j != 0) ) { @@ -415,7 +589,21 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo, return CHAMELEON_SUCCESS; } - chameleon_pzsymm( side, uplo, alpha, A, B, beta, C, sequence, request ); + if ( user_ws == NULL ) { + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C ); + } + else { + ws = user_ws; + } + chameleon_pzsymm( ws, side, uplo, alpha, A, B, beta, C, sequence, request ); + + if ( user_ws == NULL ) { + CHAMELEON_Desc_Flush( A, sequence ); + CHAMELEON_Desc_Flush( B, sequence ); + CHAMELEON_Desc_Flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + CHAMELEON_zsymm_WS_Free( ws ); + } return CHAMELEON_SUCCESS; } diff --git a/control/chameleon_zf77.c b/control/chameleon_zf77.c index cf1b8d94ebd98bf55ae746818056e17a9e44d075..6132c2c9ec14cf6512a9f379c8a1dc3f2f14a534 100644 --- a/control/chameleon_zf77.c +++ b/control/chameleon_zf77.c @@ -757,7 +757,7 @@ void CHAMELEON_ZGETRS_NOPIV_TILE_ASYNC(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_s #if defined(PRECISION_z) || defined(PRECISION_c) void CHAMELEON_ZHEMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info) -{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); } +{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); } #endif #if defined(PRECISION_z) || defined(PRECISION_c) @@ -840,7 +840,7 @@ void CHAMELEON_ZSYTRS_TILE_ASYNC(cham_uplo_t *uplo, CHAM_desc_t *A, CHAM_desc_t #endif void CHAMELEON_ZSYMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info) -{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); } +{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); } void CHAMELEON_ZSYR2K_TILE_ASYNC(cham_uplo_t *uplo, cham_trans_t *trans, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info) { *info = CHAMELEON_zsyr2k_Tile_Async(*uplo, *trans, *alpha, A, B, *beta, C, sequence, request); } diff --git a/control/compute_z.h b/control/compute_z.h index 760127d31d5eab6d4339316c2888f36e6a535666..1ee61f62894310b1f41c442d47422c7729745e10 100644 --- a/control/compute_z.h +++ b/control/compute_z.h @@ -81,7 +81,7 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTI void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzgetrf_rectil(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzhegst(int itype, cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzhemm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *E, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); @@ -106,7 +106,7 @@ void chameleon_pzplrnk(int K, CHAM_desc_t *C, unsigned long long int seedA, unsi void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzshift(int, int, int, CHAMELEON_Complex64_t *, int *, int, int, int, RUNTIME_sequence_t*, RUNTIME_request_t*); -void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +void chameleon_pzsymm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h index 3a800ec85f90f186e2a802698d533b0adc073cbb..6e576d21b1e231fee7d66277cfc72311c14d093b 100644 --- a/include/chameleon/chameleon_z.h +++ b/include/chameleon/chameleon_z.h @@ -213,7 +213,7 @@ int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, RUNTIME_sequence_t *sequen //int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zgetrs_nopiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zherk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zher2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); //int CHAMELEON_zheev_Tile_Async(cham_job_t jobz, cham_uplo_t uplo, CHAM_desc_t *A, double *W, CHAM_desc_t *T, CHAMELEON_Complex64_t *Q, int LDQ, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); @@ -247,7 +247,7 @@ int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t int CHAMELEON_zpotrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsysv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsytrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); -int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); +int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsyrk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_zsyr2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request); int CHAMELEON_ztpgqrt_Tile_Async( int L, CHAM_desc_t *V1, CHAM_desc_t *T1, CHAM_desc_t *V2, CHAM_desc_t *T2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ); @@ -312,6 +312,10 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s */ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); void CHAMELEON_zgemm_WS_Free( void *ws ); +void *CHAMELEON_zhemm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); +void CHAMELEON_zhemm_WS_Free( void *ws ); +void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C ); +void CHAMELEON_zsymm_WS_Free( void *ws ); void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A ); void CHAMELEON_zcesca_WS_Free( void *ws ); void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A ); diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index ae307f760eb78d5ec4b3fec9a41db694211382be..4576ba9e44671fa4964b988d9f9a73e6bb9df312 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -110,6 +110,12 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); +void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, @@ -210,6 +216,12 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); +void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, diff --git a/runtime/openmp/codelets/codelet_zhemm.c b/runtime/openmp/codelets/codelet_zhemm.c index 79d36ff56f9c76074225a9b0438631236eab3b39..2d36c052c02632b2363f2ba6078e46839e4b84ea 100644 --- a/runtime/openmp/codelets/codelet_zhemm.c +++ b/runtime/openmp/codelets/codelet_zhemm.c @@ -20,23 +20,37 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zhemm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) TCORE_zhemm( side, uplo, - m, n, - alpha, tileA, - tileB, - beta, tileC ); + m, n, + alpha, tileA, + tileB, + beta, tileC ); (void)options; (void)nb; } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/openmp/codelets/codelet_zsymm.c b/runtime/openmp/codelets/codelet_zsymm.c index 90ea5d8490c741a38ff56536c32f25e37616f0c3..b89c945fb3cf882417048869873e71d33a21c3c6 100644 --- a/runtime/openmp/codelets/codelet_zsymm.c +++ b/runtime/openmp/codelets/codelet_zsymm.c @@ -20,23 +20,37 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zsymm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +void +INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) TCORE_zsymm( side, uplo, - m, n, - alpha, tileA, - tileB, - beta, tileC ); + m, n, + alpha, tileA, + tileB, + beta, tileC ); (void)options; (void)nb; } + +void +INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zsymm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/parsec/codelets/codelet_zgemm.c b/runtime/parsec/codelets/codelet_zgemm.c index 5a0938ed97e3dc116c18007b55897ddad0253c75..b831ebd09b00ceb5257e09fba345b851252edf26 100644 --- a/runtime/parsec/codelets/codelet_zgemm.c +++ b/runtime/parsec/codelets/codelet_zgemm.c @@ -23,11 +23,6 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_z.h" -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ static inline int CORE_zgemm_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task ) @@ -50,20 +45,21 @@ CORE_zgemm_parsec( parsec_execution_stream_t *context, this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc ); CORE_zgemm( transA, transB, m, n, k, - alpha, A, lda, - B, ldb, - beta, C, ldc); + alpha, A, lda, + B, ldb, + beta, C, ldc ); (void)context; return PARSEC_HOOK_RETURN_DONE; } -void INSERT_TASK_zgemm( const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, +void +INSERT_TASK_zgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); @@ -71,7 +67,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); parsec_dtd_taskpool_insert_task( - PARSEC_dtd_taskpool, CORE_zgemm_parsec, options->priority, "Gemm", + PARSEC_dtd_taskpool, CORE_zgemm_parsec, options->priority, "gemm", sizeof(cham_trans_t), &transA, VALUE, sizeof(cham_trans_t), &transB, VALUE, sizeof(int), &m, VALUE, @@ -95,7 +91,7 @@ INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb, diff --git a/runtime/parsec/codelets/codelet_zhemm.c b/runtime/parsec/codelets/codelet_zhemm.c index a1c398308c514eaaea41be192c061bc42b09ac09..62ecccc26161921e93e832c6773a83284f6d76dc 100644 --- a/runtime/parsec/codelets/codelet_zhemm.c +++ b/runtime/parsec/codelets/codelet_zhemm.c @@ -43,20 +43,21 @@ CORE_zhemm_parsec( parsec_execution_stream_t *context, this_task, &side, &uplo, &M, &N, &alpha, &A, &LDA, &B, &LDB, &beta, &C, &LDC ); CORE_zhemm( side, uplo, M, N, - alpha, A, LDA, - B, LDB, - beta, C, LDC); + alpha, A, LDA, + B, LDB, + beta, C, LDC ); (void)context; return PARSEC_HOOK_RETURN_DONE; } -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); @@ -81,3 +82,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, (void)nb; } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/parsec/codelets/codelet_zsymm.c b/runtime/parsec/codelets/codelet_zsymm.c index cf8fc50cd9c3a2f3718e3be355a5ee731e65a656..d2868d4c11b2d3c9ee482ea4f1470616ab1cf908 100644 --- a/runtime/parsec/codelets/codelet_zsymm.c +++ b/runtime/parsec/codelets/codelet_zsymm.c @@ -24,7 +24,7 @@ static inline int CORE_zsymm_parsec( parsec_execution_stream_t *context, - parsec_task_t *this_task ) + parsec_task_t *this_task ) { cham_side_t side; cham_uplo_t uplo; @@ -45,18 +45,19 @@ CORE_zsymm_parsec( parsec_execution_stream_t *context, CORE_zsymm( side, uplo, M, N, alpha, A, LDA, B, LDB, - beta, C, LDC); + beta, C, LDC ); (void)context; return PARSEC_HOOK_RETURN_DONE; } -void INSERT_TASK_zsymm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); @@ -81,3 +82,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, (void)nb; } + +void +INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zsymm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c index d854c638a2f01e68018ca0973e67ce8a4d0604a6..1eb63bb7bfd66ec2a667357a1b4efcd7b2550da7 100644 --- a/runtime/quark/codelets/codelet_zgemm.c +++ b/runtime/quark/codelets/codelet_zgemm.c @@ -46,12 +46,13 @@ void CORE_zgemm_quark(Quark *quark) beta, tileC ); } -void INSERT_TASK_zgemm(const RUNTIME_option_t *options, - cham_trans_t transA, cham_trans_t transB, - int m, int n, int k, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zgemm( const RUNTIME_option_t *options, + cham_trans_t transA, cham_trans_t transB, + int m, int n, int k, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, @@ -81,7 +82,7 @@ INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, int m, int n, int k, int nb, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, + const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb, diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c index 9ab119755135f795ebc39031bd08795f5a59b07b..917376cedfdea194ae2c789808e78a92dad1f610 100644 --- a/runtime/quark/codelets/codelet_zhemm.c +++ b/runtime/quark/codelets/codelet_zhemm.c @@ -30,28 +30,28 @@ void CORE_zhemm_quark(Quark *quark) { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; - quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); - TCORE_zhemm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC); + TCORE_zhemm( side, uplo, + m, n, + alpha, tileA, tileB, + beta, tileC ); } -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, @@ -74,3 +74,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/quark/codelets/codelet_zsymm.c b/runtime/quark/codelets/codelet_zsymm.c index 6a57a49f9429384c38b6c03ce93984f503a3c744..a1cf3d43150d6faab68001cfdaffeba009120f40 100644 --- a/runtime/quark/codelets/codelet_zsymm.c +++ b/runtime/quark/codelets/codelet_zsymm.c @@ -30,28 +30,28 @@ void CORE_zsymm_quark(Quark *quark) { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; - quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); - TCORE_zsymm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC); + TCORE_zsymm( side, uplo, + m, n, + alpha, tileA, tileB, + beta, tileC ); } -void INSERT_TASK_zsymm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, @@ -74,3 +74,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } + +void +INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zsymm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index bf72c91c2cd697268438ac9641f977be4b77cd51..9e1574d7abd7fe95b0c47904aa44f83744f0e9a1 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -87,8 +87,6 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, handle ); - - return; } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -182,10 +180,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif - 0 ); - - (void)nb; } void INSERT_TASK_zgemm( const RUNTIME_option_t *options, @@ -260,8 +255,5 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif - 0 ); - - (void)nb; } diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index c05dbb92d53962d337b995f3725f02aeae19e31f..4451431f4c075feca2d6d6f6170098be3cf1d3cc 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -26,111 +26,229 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zhemm_args_s { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zhemm_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - TCORE_zhemm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + TCORE_zhemm( clargs->side, clargs->uplo, + clargs->m, clargs->n, + clargs->alpha, tileA, tileB, + clargs->beta, tileC ); } #ifdef CHAMELEON_USE_CUDA -static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) +static void +cl_zhemm_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_side_t side; - cham_uplo_t uplo; - int M; - int N; - cuDoubleComplex alpha; + struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); CUDA_zhemm( - side, uplo, - M, N, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, + clargs->side, clargs->uplo, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, handle ); } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + + struct cl_zhemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zhemm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_zhemm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zhemm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ + rt_starpu_insert_task( + &cl_zhemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zhemm; - void (*callback)(void*) = options->profiling ? cl_zhemm_callback : NULL; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_zhemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zhemm"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zhemm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zhemm_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zhemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zhemm", + STARPU_NAME, cl_name, #endif - 0); + 0 ); } diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index e1507f65791c4fe99dfab487a5f0ae2a2a05e96b..62b8de470b7c36f0cb82ecf826c0444707289d71 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -26,111 +26,229 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zsymm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zsymm_args_s { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zsymm_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - TCORE_zsymm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + TCORE_zsymm( clargs->side, clargs->uplo, + clargs->m, clargs->n, + clargs->alpha, tileA, tileB, + clargs->beta, tileC ); } #ifdef CHAMELEON_USE_CUDA -static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) +static void +cl_zsymm_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_side_t side; - cham_uplo_t uplo; - int M; - int N; - cuDoubleComplex alpha; + struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); CUDA_zsymm( - side, uplo, - M, N, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, + clargs->side, clargs->uplo, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, handle ); } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zsymm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + + struct cl_zsymm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zsymm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_zsymm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zsymm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ + rt_starpu_insert_task( + &cl_zsymm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_zsymm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zsymm; - void (*callback)(void*) = options->profiling ? cl_zsymm_callback : NULL; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_zsymm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zsymm"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zsymm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zsymm_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zsymm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zsymm", + STARPU_NAME, cl_name, #endif - 0); + 0 ); } diff --git a/testing/testing_zhemm.c b/testing/testing_zhemm.c index 38d91da96d7d96794e665a0551d196bd055e8779..e80471c8fd687bb301f5b703442f17a6ce9e2366 100644 --- a/testing/testing_zhemm.c +++ b/testing/testing_zhemm.c @@ -57,6 +57,7 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) /* Descriptors */ int Am; CHAM_desc_t *descA, *descB, *descC, *descCinit; + void *ws = NULL; bump = run_arg_get_double( args, "bump", bump ); alpha = run_arg_get_complex64( args, "alpha", alpha ); @@ -85,11 +86,15 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) CHAMELEON_zplrnt_Tile( descB, seedB ); CHAMELEON_zplrnt_Tile( descC, seedC ); + if ( async ) { + ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, descA, descB, descC ); + } + /* Calculates the product */ testing_start( &test_data ); if ( async ) { hres = CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC, - test_data.sequence, &test_data.request ); + ws, test_data.sequence, &test_data.request ); CHAMELEON_Desc_Flush( descA, test_data.sequence ); CHAMELEON_Desc_Flush( descB, test_data.sequence ); CHAMELEON_Desc_Flush( descC, test_data.sequence ); @@ -100,6 +105,10 @@ testing_zhemm_desc( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zhemm( side, M, N ) ); + if ( ws != NULL ) { + CHAMELEON_zhemm_WS_Free( ws ); + } + /* Checks the solution */ if ( check ) { CHAMELEON_Desc_Create( diff --git a/testing/testing_zsymm.c b/testing/testing_zsymm.c index b4edb1e840c4d095e6832eba9ea7759d9810c0db..39f9663010ec11b9cb4f608eb99ae0366dbce624 100644 --- a/testing/testing_zsymm.c +++ b/testing/testing_zsymm.c @@ -57,6 +57,7 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) /* Descriptors */ int Am; CHAM_desc_t *descA, *descB, *descC, *descCinit; + void *ws = NULL; bump = run_arg_get_double( args, "bump", bump ); alpha = run_arg_get_complex64( args, "alpha", alpha ); @@ -85,11 +86,15 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) CHAMELEON_zplrnt_Tile( descB, seedB ); CHAMELEON_zplrnt_Tile( descC, seedC ); + if ( async ) { + ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, descA, descB, descC ); + } + /* Calculates the product */ testing_start( &test_data ); if ( async ) { hres = CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC, - test_data.sequence, &test_data.request ); + ws, test_data.sequence, &test_data.request ); CHAMELEON_Desc_Flush( descA, test_data.sequence ); CHAMELEON_Desc_Flush( descB, test_data.sequence ); CHAMELEON_Desc_Flush( descC, test_data.sequence ); @@ -100,6 +105,10 @@ testing_zsymm_desc( run_arg_list_t *args, int check ) test_data.hres = hres; testing_stop( &test_data, flops_zsymm( side, M, N ) ); + if ( ws != NULL ) { + CHAMELEON_zsymm_WS_Free( ws ); + } + /* Checks the solution */ if ( check ) { CHAMELEON_Desc_Create(