diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 93efe6fcaf43be3a57c8c8314a7b9c11f6c7448d..b41220165efa574408a907cdd54e6e55dd705652 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -23,47 +23,36 @@ */ #include "control/common.h" -#define A(m, n) A, m, n -#define B(m, n) B, m, n -#define C(m, n) C, m, n -#define WA(m, n) &WA, m, n -#define WB(m, n) &WB, m, n +#define A( _m_, _n_ ) A, (_m_), (_n_) +#define B( _m_, _n_ ) B, (_m_), (_n_) +#define C( _m_, _n_ ) C, (_m_), (_n_) +#define WA( _m_, _n_ ) WA, (_m_), (_n_) +#define WB( _m_, _n_ ) WB, (_m_), (_n_) /** * Parallel tile hermitian matrix-matrix multiplication. * SUMMA algorithm for 2D block-cyclic data distribution. */ static inline void -chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_option_t *options ) +chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, + RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - cham_trans_t transA, transB; - int Am, An, m, n, k, p, q, KT, K, lp, lq; - int ldam, ldbk, ldbm, ldcm; + cham_trans_t transA; + int m, n, k, p, q, KT, K, lp, lq; + int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; CHAMELEON_Complex64_t zbeta; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; - CHAM_desc_t WA, WB; lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - NULL, NULL, NULL ); - - KT = side == ChamLeft ? A->nt : A->mt; - K = side == ChamLeft ? A->n : A->m; + KT = A->nt; + K = A->n; myp = C->myrank / C->q; myq = C->myrank % C->q; @@ -72,162 +61,291 @@ chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t lq = (k % lookahead) * C->q; tempkk = k == KT - 1 ? K - k * A->nb : A->nb; zbeta = k == 0 ? beta : zone; - ldbk = BLKLDD(B, k); /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m ++ ) { - tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + int Am, Ak, ldam; + int tempam, tempak; - if ( side == ChamLeft ) { - if ( (( uplo == ChamUpper ) && ( m > k )) || - (( uplo == ChamLower ) && ( m < k )) ) { - Am = k; - An = m; - } - else { - Am = m; - An = k; - } - ldam = BLKLDD(A, Am); - - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - A( Am, An ), ldam, - WA( m, (k % C->q) + lq ), WA.mb ); - - RUNTIME_data_flush( sequence, A( Am, An ) ); + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; - for ( q=1; q < C->q; q++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); - } + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) + { + /* Let's take A( k, m ) */ + Am = k; + Ak = m; + tempam = tempkk; + tempak = tempmm; } else { - ldbm = BLKLDD(B, m); + /* Let's take A( m, k ) */ + Am = m; + Ak = k; + tempam = tempmm; + tempak = tempkk; + } + ldam = BLKLDD( A, Am ); - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - B( m, k ), ldbm, - WA( m, (k % C->q) + lq ), WA.mb ); + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempam, tempak, C->mb, + A( Am, Ak ), ldam, + WA( m, (k % C->q) + lq ), WA->mb ); - RUNTIME_data_flush( sequence, B( m, k ) ); + RUNTIME_data_flush( sequence, A( Am, Ak ) ); - for ( q=1; q < C->q; q++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); - } + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempam, tempak, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA->mb, + WA( m, ((k+q) % C->q) + lq ), WA->mb ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { + int ldbk; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + ldbk = BLKLDD( B, k ); - if ( side == ChamRight ) { - if ( (( uplo == ChamUpper ) && ( n < k )) || - (( uplo == ChamLower ) && ( n > k )) ) { - Am = n; - An = k; - } - else { - Am = k; - An = n; - } - ldam = BLKLDD(A, Am); + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + B( k, n ), ldbk, + WB( (k % C->p) + lp, n ), WB->mb ); + RUNTIME_data_flush( sequence, B( k, n ) ); + + for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - A( Am, An ), ldam, - WB( (k % C->p) + lp, n ), WB.mb ); + WB( ((k+p-1) % C->p) + lp, n ), WB->mb, + WB( ((k+p) % C->p) + lp, n ), WB->mb ); + } + } + + /* Perform the update of this iteration */ + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_flush( sequence, A( Am, An ) ); + if ( k == m ) { + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - for ( p=1; p < C->p; p++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); + INSERT_TASK_zhemm( + options, ChamLeft, uplo, + tempmm, tempnn, A->mb, + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); } } else { + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) + { + transA = ChamConjTrans; + } + else { + transA = ChamNoTrans; + } + + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + INSERT_TASK_zgemm( + options, transA, ChamNoTrans, + tempmm, tempnn, tempkk, A->mb, + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); + } + } + } + } +} + +/** + * Parallel tile hermitian matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. + */ +static inline void +chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, + RUNTIME_option_t *options ) +{ + RUNTIME_sequence_t *sequence = options->sequence; + cham_trans_t transA; + int m, n, k, p, q, KT, K, lp, lq; + int ldcm; + int tempmm, tempnn, tempkk; + int lookahead, myp, myq; + + CHAMELEON_Complex64_t zbeta; + CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + + lookahead = chamctxt->lookahead; + KT = A->mt; + K = A->m; + myp = C->myrank / C->q; + myq = C->myrank % C->q; + + for (k = 0; k < KT; k++ ) { + lp = (k % lookahead) * C->p; + lq = (k % lookahead) * C->q; + tempkk = k == KT - 1 ? K - k * A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + + /* Transfert ownership of the k column of A or B */ + for (m = 0; m < C->mt; m++ ) { + int ldbm; + + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + ldbm = BLKLDD( B, m ); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + B( m, k ), ldbm, + WA( m, (k % C->q) + lq ), WA->mb ); + + RUNTIME_data_flush( sequence, B( m, k ) ); + + for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, - B( k, n ), ldbk, - WB( (k % C->p) + lp, n ), WB.mb ); + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA->mb, + WA( m, ((k+q) % C->q) + lq ), WA->mb ); + } + } - RUNTIME_data_flush( sequence, B( k, n ) ); + /* Transfert ownership of the k row of B, or A */ + for (n = 0; n < C->nt; n++) { + int Ak, An, ldak; + int tempak, tempan; - for ( p=1; p < C->p; p++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); - } + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) + { + Ak = n; + An = k; + tempak = tempnn; + tempan = tempkk; + } + else + { + Ak = k; + An = n; + tempak = tempkk; + tempan = tempnn; + } + ldak = BLKLDD( A, Ak ); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempak, tempan, C->mb, + A( Ak, An ), ldak, + WB( (k % C->p) + lp, n ), WB->mb ); + + RUNTIME_data_flush( sequence, A( Ak, An ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempak, tempan, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB->mb, + WB( ((k+p) % C->p) + lp, n ), WB->mb ); } } - /* - * ChamLeft / ChamLower - */ - for (m = myp; m < C->mt; m+=C->p) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); + /* Perform the update of this iteration */ + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - for (n = myq; n < C->nt; n+=C->q) { - tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + if ( k == n ) { + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - if (side == ChamLeft) { - transB = ChamNoTrans; - if ( (( uplo == ChamUpper ) && ( m > k )) || - (( uplo == ChamLower ) && ( m < k )) ) { - transA = ChamConjTrans; - } - else { - transA = ChamNoTrans; - } + /* A has been stored in WA or WB for the summa ring */ + INSERT_TASK_zhemm( + options, ChamRight, uplo, + tempmm, tempnn, A->mb, + alpha, WB( myp + lp, n ), WB->mb, + WA( m, myq + lq ), WA->mb, + zbeta, C( m, n ), ldcm ); + } + } + else { + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) + { + transA = ChamConjTrans; } else { transA = ChamNoTrans; - if ( (( uplo == ChamUpper ) && ( n < k )) || - (( uplo == ChamLower ) && ( n > k )) ) { - transB = ChamConjTrans; - } - else { - transB = ChamNoTrans; - } } - if ( k == m ) { - INSERT_TASK_zhemm( - options, side, uplo, - tempmm, tempnn, A->mb, - alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ - WB( myp + lp, n ), WB.mb, /* ldb * Y */ - zbeta, C( m, n ), ldcm ); /* ldc * Y */ - } - else { + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + INSERT_TASK_zgemm( - options, transA, transB, + options, ChamNoTrans, transA, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ - WB( myp + lp, n ), WB.mb, /* ldb * Y */ - zbeta, C( m, n ), ldcm ); /* ldc * Y */ + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); } } } } +} + +/** + * Parallel tile hermitian matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. + */ +static inline void +chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + RUNTIME_sequence_t *sequence = options->sequence; + CHAM_desc_t WA, WB; + int lookahead; + + lookahead = chamctxt->lookahead; + chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + + if (side == ChamLeft) { + chameleon_pzhemm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, + &WA, &WB, options ); + } + else { + chameleon_pzhemm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, + &WA, &WB, options ); + } RUNTIME_desc_flush( &WA, sequence ); RUNTIME_desc_flush( &WB, sequence ); @@ -453,7 +571,8 @@ chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, { chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); } - else { + else + { chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); } diff --git a/compute/pzher2k.c b/compute/pzher2k.c index 70cdd6b7b7fa3a2dc1e717c34e48ebaec75c1bb5..8f51860bb996a404e4307e4436a2f06e98d59b53 100644 --- a/compute/pzher2k.c +++ b/compute/pzher2k.c @@ -29,15 +29,15 @@ /** * Parallel tile Hermitian rank-k update - dynamic scheduling */ -void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - double beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +void chameleon_pzher2k( cham_uplo_t uplo, cham_trans_t trans, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + double beta, CHAM_desc_t *C, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) { CHAM_context_t *chamctxt; RUNTIME_option_t options; - int m, n, k; + int m, n, k, mmin, mmax; int ldak, ldam, ldan, ldcm, ldcn; int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; @@ -57,6 +57,16 @@ void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, ldan = BLKLDD(A, n); ldbn = BLKLDD(B, n); ldcn = BLKLDD(C, n); + + if (uplo == ChamLower) { + mmin = n+1; + mmax = C->mt; + } + else { + mmin = 0; + mmax = n; + } + /* * ChamNoTrans */ @@ -72,68 +82,34 @@ void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, B(n, k), ldbn, dbeta, C(n, n), ldcn); /* ldc * N */ } - /* - * ChamNoTrans / ChamLower - */ - if (uplo == ChamLower) { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); - for (k = 0; k < A->nt; k++) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamConjTrans, - tempmm, tempnn, tempkn, A->mb, - conj(alpha), A(m, k), ldam, /* ldam * K */ - B(n, k), ldbn, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldc * N */ + for (m = mmin; m < mmax; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldam = BLKLDD(A, m); + ldbm = BLKLDD(B, m); + ldcm = BLKLDD(C, m); + for (k = 0; k < A->nt; k++) { + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; + INSERT_TASK_zgemm( + &options, + ChamNoTrans, ChamConjTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, A(m, k), ldam, + B(n, k), ldbn, + zbeta, C(m, n), ldcm); - INSERT_TASK_zgemm( - &options, - trans, ChamConjTrans, - tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldam * K */ - A(n, k), ldan, /* ldan * K */ - zone, C(m, n), ldcm); /* ldc * N */ - } - } - } - /* - * ChamNoTrans / ChamUpper - */ - else { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - for (k = 0; k < A->nt; k++) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamConjTrans, - tempnn, tempmm, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - B(m, k), ldbm, /* ldam * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ - - INSERT_TASK_zgemm( - &options, - trans, ChamConjTrans, - tempnn, tempmm, tempkn, A->mb, - conj(alpha), B(n, k), ldan, /* ldan * K */ - A(m, k), ldam, /* ldam * M */ - zone, C(n, m), ldcn); /* ldc * M */ - } + INSERT_TASK_zgemm( + &options, + ChamNoTrans, ChamConjTrans, + tempmm, tempnn, tempkn, A->mb, + conj(alpha), B(m, k), ldbm, + A(n, k), ldan, + zone, C(m, n), ldcm); } } } /* - * Cham[Conj]Trans + * ChamConjTrans */ else { for (k = 0; k < A->mt; k++) { @@ -149,63 +125,29 @@ void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, B(k, n), ldbk, dbeta, C(n, n), ldcn); /* ldc * N */ } - /* - * Cham[Conj]Trans / ChamLower - */ - if (uplo == ChamLower) { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); - for (k = 0; k < A->mt; k++) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); - zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* lda * M */ - B(k, n), ldbk, /* lda * N */ - zbeta, C(m, n), ldcm); /* ldc * N */ - - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempmm, tempnn, tempkm, A->mb, - alpha, B(k, m), ldbk, /* lda * M */ - A(k, n), ldak, /* lda * N */ - zone, C(m, n), ldcm); /* ldc * N */ - } - } - } - /* - * Cham[Conj]Trans / ChamUpper - */ - else { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - for (k = 0; k < A->mt; k++) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); - zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempnn, tempmm, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * K */ - B(k, m), ldbk, /* lda * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + for (m = mmin; m < mmax; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + for (k = 0; k < A->mt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + ldak = BLKLDD(A, k); + ldbk = BLKLDD(B, k); + zbeta = k == 0 ? (CHAMELEON_Complex64_t)beta : zone; + INSERT_TASK_zgemm( + &options, + ChamConjTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), ldak, + B(k, n), ldbk, + zbeta, C(m, n), ldcm); - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempnn, tempmm, tempkm, A->mb, - conj(alpha), B(k, n), ldbk, /* lda * K */ - A(k, m), ldak, /* lda * M */ - zone, C(n, m), ldcn); /* ldc * M */ - } + INSERT_TASK_zgemm( + &options, + ChamConjTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + conj(alpha), B(k, m), ldbk, + A(k, n), ldak, + zone, C(m, n), ldcm ); } } } diff --git a/compute/pzsymm.c b/compute/pzsymm.c index 0413d63b5f94fe5266ec2140232998360b175de8..f9d724c08ab8835e1dddffc332bb4298facc9ee2 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -23,47 +23,36 @@ */ #include "control/common.h" -#define A(m, n) A, m, n -#define B(m, n) B, m, n -#define C(m, n) C, m, n -#define WA(m, n) &WA, m, n -#define WB(m, n) &WB, m, n +#define A( _m_, _n_ ) A, (_m_), (_n_) +#define B( _m_, _n_ ) B, (_m_), (_n_) +#define C( _m_, _n_ ) C, (_m_), (_n_) +#define WA( _m_, _n_ ) WA, (_m_), (_n_) +#define WB( _m_, _n_ ) WB, (_m_), (_n_) /** * Parallel tile symmetric matrix-matrix multiplication. * SUMMA algorithm for 2D block-cyclic data distribution. */ static inline void -chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_option_t *options ) +chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, + RUNTIME_option_t *options ) { RUNTIME_sequence_t *sequence = options->sequence; - cham_trans_t transA, transB; - int Am, An, m, n, k, p, q, KT, K, lp, lq; - int ldam, ldbk, ldbm, ldcm; + cham_trans_t transA; + int m, n, k, p, q, KT, K, lp, lq; + int ldcm; int tempmm, tempnn, tempkk; int lookahead, myp, myq; CHAMELEON_Complex64_t zbeta; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; - CHAM_desc_t WA, WB; lookahead = chamctxt->lookahead; - chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, - C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, - NULL, NULL, NULL ); - chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, - ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), - C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, - C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, - NULL, NULL, NULL ); - - KT = side == ChamLeft ? A->nt : A->mt; - K = side == ChamLeft ? A->n : A->m; + KT = A->nt; + K = A->n; myp = C->myrank / C->q; myq = C->myrank % C->q; @@ -72,162 +61,291 @@ chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t lq = (k % lookahead) * C->q; tempkk = k == KT - 1 ? K - k * A->nb : A->nb; zbeta = k == 0 ? beta : zone; - ldbk = BLKLDD(B, k); /* Transfert ownership of the k column of A or B */ for (m = 0; m < C->mt; m ++ ) { - tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + int Am, Ak, ldam; + int tempam, tempak; - if ( side == ChamLeft ) { - if ( (( uplo == ChamUpper ) && ( m > k )) || - (( uplo == ChamLower ) && ( m < k )) ) { - Am = k; - An = m; - } - else { - Am = m; - An = k; - } - ldam = BLKLDD(A, Am); - - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - A( Am, An ), ldam, - WA( m, (k % C->q) + lq ), WA.mb ); - - RUNTIME_data_flush( sequence, A( Am, An ) ); + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; - for ( q=1; q < C->q; q++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); - } + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) + { + /* Let's take A( k, m ) */ + Am = k; + Ak = m; + tempam = tempkk; + tempak = tempmm; } else { - ldbm = BLKLDD(B, m); + /* Let's take A( m, k ) */ + Am = m; + Ak = k; + tempam = tempmm; + tempak = tempkk; + } + ldam = BLKLDD( A, Am ); - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - B( m, k ), ldbm, - WA( m, (k % C->q) + lq ), WA.mb ); + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempam, tempak, C->mb, + A( Am, Ak ), ldam, + WA( m, (k % C->q) + lq ), WA->mb ); - RUNTIME_data_flush( sequence, B( m, k ) ); + RUNTIME_data_flush( sequence, A( Am, Ak ) ); - for ( q=1; q < C->q; q++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempmm, tempkk, C->mb, - WA( m, ((k+q-1) % C->q) + lq ), WA.mb, - WA( m, ((k+q) % C->q) + lq ), WA.mb ); - } + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempam, tempak, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA->mb, + WA( m, ((k+q) % C->q) + lq ), WA->mb ); } } /* Transfert ownership of the k row of B, or A */ for (n = 0; n < C->nt; n++) { + int ldbk; + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + ldbk = BLKLDD( B, k ); - if ( side == ChamRight ) { - if ( (( uplo == ChamUpper ) && ( n < k )) || - (( uplo == ChamLower ) && ( n > k )) ) { - Am = n; - An = k; - } - else { - Am = k; - An = n; - } - ldam = BLKLDD(A, Am); + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + B( k, n ), ldbk, + WB( (k % C->p) + lp, n ), WB->mb ); + RUNTIME_data_flush( sequence, B( k, n ) ); + + for ( p=1; p < C->p; p++ ) { INSERT_TASK_zlacpy( options, ChamUpperLower, tempkk, tempnn, C->mb, - A( Am, An ), ldam, - WB( (k % C->p) + lp, n ), WB.mb ); + WB( ((k+p-1) % C->p) + lp, n ), WB->mb, + WB( ((k+p) % C->p) + lp, n ), WB->mb ); + } + } + + /* Perform the update of this iteration */ + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - RUNTIME_data_flush( sequence, A( Am, An ) ); + if ( k == m ) { + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - for ( p=1; p < C->p; p++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); + INSERT_TASK_zsymm( + options, ChamLeft, uplo, + tempmm, tempnn, A->mb, + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); } } else { + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) + { + transA = ChamTrans; + } + else { + transA = ChamNoTrans; + } + + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + INSERT_TASK_zgemm( + options, transA, ChamNoTrans, + tempmm, tempnn, tempkk, A->mb, + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); + } + } + } + } +} + +/** + * Parallel tile symmetric matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. + */ +static inline void +chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + CHAM_desc_t *WA, CHAM_desc_t *WB, + RUNTIME_option_t *options ) +{ + RUNTIME_sequence_t *sequence = options->sequence; + cham_trans_t transA; + int m, n, k, p, q, KT, K, lp, lq; + int ldcm; + int tempmm, tempnn, tempkk; + int lookahead, myp, myq; + + CHAMELEON_Complex64_t zbeta; + CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + + lookahead = chamctxt->lookahead; + KT = A->mt; + K = A->m; + myp = C->myrank / C->q; + myq = C->myrank % C->q; + + for (k = 0; k < KT; k++ ) { + lp = (k % lookahead) * C->p; + lq = (k % lookahead) * C->q; + tempkk = k == KT - 1 ? K - k * A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + + /* Transfert ownership of the k column of A or B */ + for (m = 0; m < C->mt; m++ ) { + int ldbm; + + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + ldbm = BLKLDD( B, m ); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + B( m, k ), ldbm, + WA( m, (k % C->q) + lq ), WA->mb ); + + RUNTIME_data_flush( sequence, B( m, k ) ); + + for ( q=1; q < C->q; q++ ) { INSERT_TASK_zlacpy( options, - ChamUpperLower, tempkk, tempnn, C->mb, - B( k, n ), ldbk, - WB( (k % C->p) + lp, n ), WB.mb ); + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA->mb, + WA( m, ((k+q) % C->q) + lq ), WA->mb ); + } + } - RUNTIME_data_flush( sequence, B( k, n ) ); + /* Transfert ownership of the k row of B, or A */ + for (n = 0; n < C->nt; n++) { + int Ak, An, ldak; + int tempak, tempan; - for ( p=1; p < C->p; p++ ) { - INSERT_TASK_zlacpy( - options, - ChamUpperLower, tempkk, tempnn, C->mb, - WB( ((k+p-1) % C->p) + lp, n ), WB.mb, - WB( ((k+p) % C->p) + lp, n ), WB.mb ); - } + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) + { + Ak = n; + An = k; + tempak = tempnn; + tempan = tempkk; + } + else + { + Ak = k; + An = n; + tempak = tempkk; + tempan = tempnn; + } + ldak = BLKLDD( A, Ak ); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempak, tempan, C->mb, + A( Ak, An ), ldak, + WB( (k % C->p) + lp, n ), WB->mb ); + + RUNTIME_data_flush( sequence, A( Ak, An ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempak, tempan, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB->mb, + WB( ((k+p) % C->p) + lp, n ), WB->mb ); } } - /* - * ChamLeft / ChamLower - */ - for (m = myp; m < C->mt; m+=C->p) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); + /* Perform the update of this iteration */ + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - for (n = myq; n < C->nt; n+=C->q) { - tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + if ( k == n ) { + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); - if (side == ChamLeft) { - transB = ChamNoTrans; - if ( (( uplo == ChamUpper ) && ( m > k )) || - (( uplo == ChamLower ) && ( m < k )) ) { - transA = ChamTrans; - } - else { - transA = ChamNoTrans; - } + /* A has been stored in WA or WB for the summa ring */ + INSERT_TASK_zsymm( + options, ChamRight, uplo, + tempmm, tempnn, A->mb, + alpha, WB( myp + lp, n ), WB->mb, + WA( m, myq + lq ), WA->mb, + zbeta, C( m, n ), ldcm ); + } + } + else { + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) + { + transA = ChamTrans; } else { transA = ChamNoTrans; - if ( (( uplo == ChamUpper ) && ( n < k )) || - (( uplo == ChamLower ) && ( n > k )) ) { - transB = ChamTrans; - } - else { - transB = ChamNoTrans; - } } - if ( k == m ) { - INSERT_TASK_zsymm( - options, side, uplo, - tempmm, tempnn, A->mb, - alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ - WB( myp + lp, n ), WB.mb, /* ldb * Y */ - zbeta, C( m, n ), ldcm ); /* ldc * Y */ - } - else { + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + INSERT_TASK_zgemm( - options, transA, transB, + options, ChamNoTrans, transA, tempmm, tempnn, tempkk, A->mb, - alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ - WB( myp + lp, n ), WB.mb, /* ldb * Y */ - zbeta, C( m, n ), ldcm ); /* ldc * Y */ + alpha, WA( m, myq + lq ), WA->mb, + WB( myp + lp, n ), WB->mb, + zbeta, C( m, n ), ldcm ); } } } } +} + +/** + * Parallel tile symmetric matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. + */ +static inline void +chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + RUNTIME_sequence_t *sequence = options->sequence; + CHAM_desc_t WA, WB; + int lookahead; + + lookahead = chamctxt->lookahead; + chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + + if (side == ChamLeft) { + chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C, + &WA, &WB, options ); + } + else { + chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C, + &WA, &WB, options ); + } RUNTIME_desc_flush( &WA, sequence ); RUNTIME_desc_flush( &WB, sequence ); @@ -453,7 +571,8 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, { chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); } - else { + else + { chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); } diff --git a/compute/pzsyr2k.c b/compute/pzsyr2k.c index 17ae3ee8bbaad7f560b9f80d2d0bb0b53789a7a5..0b5f7195f398bd8319a38868cc6ac8ac7de92d55 100644 --- a/compute/pzsyr2k.c +++ b/compute/pzsyr2k.c @@ -29,15 +29,15 @@ /** * Parallel tile Hermitian rank-k update - dynamic scheduling */ -void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +void chameleon_pzsyr2k( cham_uplo_t uplo, cham_trans_t trans, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) { CHAM_context_t *chamctxt; RUNTIME_option_t options; - int m, n, k; + int m, n, k, mmin, mmax; int ldak, ldam, ldan, ldcm, ldcn; int ldbk, ldbm, ldbn; int tempnn, tempmm, tempkn, tempkm; @@ -56,6 +56,16 @@ void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, ldan = BLKLDD(A, n); ldbn = BLKLDD(B, n); ldcn = BLKLDD(C, n); + + if (uplo == ChamLower) { + mmin = n+1; + mmax = C->mt; + } + else { + mmin = 0; + mmax = n; + } + /* * ChamNoTrans */ @@ -71,68 +81,34 @@ void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, B(n, k), ldbn, zbeta, C(n, n), ldcn); /* ldc * N */ } - /* - * ChamNoTrans / ChamLower - */ - if (uplo == ChamLower) { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); - for (k = 0; k < A->nt; k++) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - zbeta = k == 0 ? beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamTrans, - tempmm, tempnn, tempkn, A->mb, - alpha, A(m, k), ldam, /* ldam * K */ - B(n, k), ldbn, /* ldan * K */ - zbeta, C(m, n), ldcm); /* ldc * N */ + for (m = mmin; m < mmax; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldam = BLKLDD(A, m); + ldbm = BLKLDD(B, m); + ldcm = BLKLDD(C, m); + for (k = 0; k < A->nt; k++) { + tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + INSERT_TASK_zgemm( + &options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, A(m, k), ldam, + B(n, k), ldbn, + zbeta, C(m, n), ldcm); - INSERT_TASK_zgemm( - &options, - trans, ChamTrans, - tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, /* ldam * K */ - A(n, k), ldan, /* ldan * K */ - zone, C(m, n), ldcm); /* ldc * N */ - } - } - } - /* - * ChamNoTrans / ChamUpper - */ - else { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldbm = BLKLDD(B, m); - for (k = 0; k < A->nt; k++) { - tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; - zbeta = k == 0 ? beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamTrans, - tempnn, tempmm, tempkn, A->mb, - alpha, A(n, k), ldan, /* ldan * K */ - B(m, k), ldbm, /* ldam * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ - - INSERT_TASK_zgemm( - &options, - trans, ChamTrans, - tempnn, tempmm, tempkn, A->mb, - alpha, B(n, k), ldan, /* ldan * K */ - A(m, k), ldam, /* ldam * M */ - zone, C(n, m), ldcn); /* ldc * M */ - } + INSERT_TASK_zgemm( + &options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), ldbm, + A(n, k), ldan, + zone, C(m, n), ldcm); } } } /* - * Cham[Conj]Trans + * ChamTrans */ else { for (k = 0; k < A->mt; k++) { @@ -148,63 +124,29 @@ void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, B(k, n), ldbk, zbeta, C(n, n), ldcn); /* ldc * N */ } - /* - * Cham[Conj]Trans / ChamLower - */ - if (uplo == ChamLower) { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldcm = BLKLDD(C, m); - for (k = 0; k < A->mt; k++) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); - zbeta = k == 0 ? beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, /* lda * M */ - B(k, n), ldbk, /* lda * N */ - zbeta, C(m, n), ldcm); /* ldc * N */ - - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempmm, tempnn, tempkm, A->mb, - alpha, B(k, m), ldbk, /* lda * M */ - A(k, n), ldak, /* lda * N */ - zone, C(m, n), ldcm); /* ldc * N */ - } - } - } - /* - * Cham[Conj]Trans / ChamUpper - */ - else { - for (m = n+1; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - for (k = 0; k < A->mt; k++) { - tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); - zbeta = k == 0 ? beta : zone; - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempnn, tempmm, tempkm, A->mb, - alpha, A(k, n), ldak, /* lda * K */ - B(k, m), ldbk, /* lda * M */ - zbeta, C(n, m), ldcn); /* ldc * M */ + for (m = mmin; m < mmax; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + for (k = 0; k < A->mt; k++) { + tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb; + ldak = BLKLDD(A, k); + ldbk = BLKLDD(B, k); + zbeta = k == 0 ? beta : zone; + INSERT_TASK_zgemm( + &options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, A(k, m), ldak, + B(k, n), ldbk, + zbeta, C(m, n), ldcm); - INSERT_TASK_zgemm( - &options, - trans, ChamNoTrans, - tempnn, tempmm, tempkm, A->mb, - alpha, B(k, n), ldbk, /* lda * K */ - A(k, m), ldak, /* lda * M */ - zone, C(n, m), ldcn); /* ldc * M */ - } + INSERT_TASK_zgemm( + &options, + ChamTrans, ChamNoTrans, + tempmm, tempnn, tempkm, A->mb, + alpha, B(k, m), ldbk, + A(k, n), ldak, + zone, C(m, n), ldcm ); } } }