diff --git a/compute/pzgemm.c b/compute/pzgemm.c index e0d51ac963a5078363f6b7386dc06c5e2e221bec..d6d9c7eec81e74a2da0bcd0eea344ccf2dacc98b 100644 --- a/compute/pzgemm.c +++ b/compute/pzgemm.c @@ -26,16 +26,176 @@ #define A(m, n) A, m, n #define B(m, n) B, m, n #define C(m, n) C, m, n +#define WA(m, n) &WA, m, n +#define WB(m, n) &WB, m, n + /** - * Parallel tile matrix-matrix multiplication - dynamic scheduling + * Parallel tile matrix-matrix multiplication + * SUMMA algorithm for 2D block-cyclic data distribution. */ -void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +static inline void +chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) { - CHAM_context_t *chamctxt; - RUNTIME_option_t options; + RUNTIME_sequence_t *sequence = options->sequence; + int m, n, k, p, q, KT, K, lp, lq; + int ldam, ldak, ldbn, ldbk, ldcm; + int tempmm, tempnn, tempkk; + int lookahead, myp, myq; + + CHAMELEON_Complex64_t zbeta; + CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + CHAM_desc_t WA, WB; + + lookahead = chamctxt->lookahead; + chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + + KT = transA == ChamNoTrans ? A->nt : A->mt; + K = transA == ChamNoTrans ? A->n : A->m; + myp = A->myrank / A->q; + myq = A->myrank % A->q; + + /* + * A: ChamNoTrans / B: ChamNoTrans + */ + for (k = 0; k < KT; k++ ) { + lp = (k % lookahead) * C->p; + lq = (k % lookahead) * C->q; + tempkk = k == KT - 1 ? K - k * A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + ldak = BLKLDD(A, k); + ldbk = BLKLDD(B, k); + + /* Transfert ownership of the k column of A */ + for (m = 0; m < C->mt; m ++ ) { + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + ldam = BLKLDD(A, m); + + if ( transA == ChamNoTrans ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + A( m, k ), ldam, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, A( m, k ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + else { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempmm, C->mb, + A( k, m ), ldak, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, A( k, m ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempmm, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + } + + /* Transfert ownership of the k row of B */ + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + ldbn = BLKLDD(B, n); + + if ( transB == ChamNoTrans ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + B( k, n ), ldbk, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, B( k, n ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + else { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempnn, tempkk, C->mb, + B( n, k ), ldbn, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, B( n, k ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempnn, tempkk, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + } + + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + INSERT_TASK_zgemm( + options, + transA, transB, + tempmm, tempnn, tempkk, A->mb, + alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ + WB( myp + lp, n ), WB.mb, /* ldb * Y */ + zbeta, C( m, n ), ldcm ); /* ldc * Y */ + } + } + } + + RUNTIME_desc_flush( &WA, sequence ); + RUNTIME_desc_flush( &WB, sequence ); + RUNTIME_desc_flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + chameleon_desc_destroy( &WA ); + chameleon_desc_destroy( &WB ); +} + +/** + * Parallel tile matrix-matrix multiplication. + * Generic algorithm for any data distribution. + */ +static inline void +chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + RUNTIME_sequence_t *sequence = options->sequence; int m, n, k; int ldam, ldak, ldbn, ldbk, ldcm; @@ -44,12 +204,6 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, CHAMELEON_Complex64_t zbeta; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; - chamctxt = chameleon_context_self(); - if (sequence->status != CHAMELEON_SUCCESS) { - return; - } - RUNTIME_options_init(&options, chamctxt, sequence, request); - for (m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldcm = BLKLDD(C, m); @@ -66,11 +220,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( - &options, + options, transA, transB, tempmm, tempnn, tempkn, A->mb, alpha, A(m, k), ldam, /* lda * Z */ - B(k, n), ldbk, /* ldb * Y */ + B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } @@ -83,11 +237,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( - &options, + options, transA, transB, tempmm, tempnn, tempkn, A->mb, alpha, A(m, k), ldam, /* lda * Z */ - B(n, k), ldbn, /* ldb * Z */ + B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } @@ -103,11 +257,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, ldbk = BLKLDD(B, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( - &options, + options, transA, transB, tempmm, tempnn, tempkm, A->mb, alpha, A(k, m), ldak, /* lda * X */ - B(k, n), ldbk, /* ldb * Y */ + B(k, n), ldbk, /* ldb * Y */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } @@ -121,11 +275,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, ldak = BLKLDD(A, k); zbeta = k == 0 ? beta : zone; INSERT_TASK_zgemm( - &options, + options, transA, transB, tempmm, tempnn, tempkm, A->mb, alpha, A(k, m), ldak, /* lda * X */ - B(n, k), ldbn, /* ldb * Z */ + B(n, k), ldbn, /* ldb * Z */ zbeta, C(m, n), ldcm); /* ldc * Y */ } } @@ -142,5 +296,37 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB, } } } - RUNTIME_options_finalize(&options, chamctxt); + + (void)chamctxt; +} + +/** + * Parallel tile matrix-matrix multiplication. wrapper. + */ +void +chameleon_pzgemm( cham_trans_t transA, cham_trans_t transB, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); + + if ( ((C->p > 1) || (C->q > 1)) && + (C->get_rankof == chameleon_getrankof_2d) && + (chamctxt->generic_enabled != CHAMELEON_TRUE) ) + { + chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options ); + } + + RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/compute/pzhemm.c b/compute/pzhemm.c index 9fcaf6c3797659380fb06ec012db95b618bb4248..93efe6fcaf43be3a57c8c8314a7b9c11f6c7448d 100644 --- a/compute/pzhemm.c +++ b/compute/pzhemm.c @@ -23,20 +23,230 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define B(m,n) B, m, n -#define C(m,n) C, m, n +#define A(m, n) A, m, n +#define B(m, n) B, m, n +#define C(m, n) C, m, n +#define WA(m, n) &WA, m, n +#define WB(m, n) &WB, m, n + /** - * Parallel tile Hermitian matrix-matrix multiplication - dynamic scheduling + * Parallel tile hermitian matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. */ -void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +static inline void +chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) { - CHAM_context_t *chamctxt; - RUNTIME_option_t options; + RUNTIME_sequence_t *sequence = options->sequence; + cham_trans_t transA, transB; + int Am, An, m, n, k, p, q, KT, K, lp, lq; + int ldam, ldbk, ldbm, ldcm; + int tempmm, tempnn, tempkk; + int lookahead, myp, myq; + + CHAMELEON_Complex64_t zbeta; + CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + CHAM_desc_t WA, WB; + + lookahead = chamctxt->lookahead; + chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + + KT = side == ChamLeft ? A->nt : A->mt; + K = side == ChamLeft ? A->n : A->m; + myp = C->myrank / C->q; + myq = C->myrank % C->q; + + for (k = 0; k < KT; k++ ) { + lp = (k % lookahead) * C->p; + lq = (k % lookahead) * C->q; + tempkk = k == KT - 1 ? K - k * A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + ldbk = BLKLDD(B, k); + + /* Transfert ownership of the k column of A or B */ + for (m = 0; m < C->mt; m ++ ) { + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + + if ( side == ChamLeft ) { + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) { + Am = k; + An = m; + } + else { + Am = m; + An = k; + } + ldam = BLKLDD(A, Am); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + A( Am, An ), ldam, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, A( Am, An ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + else { + ldbm = BLKLDD(B, m); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + B( m, k ), ldbm, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, B( m, k ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + } + + /* Transfert ownership of the k row of B, or A */ + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + if ( side == ChamRight ) { + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) { + Am = n; + An = k; + } + else { + Am = k; + An = n; + } + ldam = BLKLDD(A, Am); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + A( Am, An ), ldam, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, A( Am, An ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + else { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + B( k, n ), ldbk, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, B( k, n ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + } + /* + * ChamLeft / ChamLower + */ + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + for (n = myq; n < C->nt; n+=C->q) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + if (side == ChamLeft) { + transB = ChamNoTrans; + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) { + transA = ChamConjTrans; + } + else { + transA = ChamNoTrans; + } + } + else { + transA = ChamNoTrans; + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) { + transB = ChamConjTrans; + } + else { + transB = ChamNoTrans; + } + } + + if ( k == m ) { + INSERT_TASK_zhemm( + options, side, uplo, + tempmm, tempnn, A->mb, + alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ + WB( myp + lp, n ), WB.mb, /* ldb * Y */ + zbeta, C( m, n ), ldcm ); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm( + options, transA, transB, + tempmm, tempnn, tempkk, A->mb, + alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ + WB( myp + lp, n ), WB.mb, /* ldb * Y */ + zbeta, C( m, n ), ldcm ); /* ldc * Y */ + } + } + } + } + + RUNTIME_desc_flush( &WA, sequence ); + RUNTIME_desc_flush( &WB, sequence ); + RUNTIME_desc_flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + chameleon_desc_destroy( &WA ); + chameleon_desc_destroy( &WB ); +} + +/** + * Parallel tile hermitian matrix-matrix multiplication. + * Generic algorithm for any data distribution. + */ +static inline void +chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ int k, m, n; int ldam, ldan, ldak, ldbk, ldbm, ldcm; int tempmm, tempnn, tempkn, tempkm; @@ -44,12 +254,6 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t zbeta; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; - chamctxt = chameleon_context_self(); - if (sequence->status != CHAMELEON_SUCCESS) { - return; - } - RUNTIME_options_init(&options, chamctxt, sequence, request); - for(m = 0; m < C->mt; m++) { tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; ldcm = BLKLDD(C, m); @@ -68,7 +272,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, zbeta = k == 0 ? beta : zone; if (k < m) { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, alpha, A(m, k), ldam, /* lda * K */ @@ -78,7 +282,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, else { if (k == m) { INSERT_TASK_zhemm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, alpha, A(k, k), ldak, /* ldak * X */ @@ -87,7 +291,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, } else { INSERT_TASK_zgemm( - &options, + options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, alpha, A(k, m), ldak, /* ldak * X */ @@ -108,7 +312,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, zbeta = k == 0 ? beta : zone; if (k < m) { INSERT_TASK_zgemm( - &options, + options, ChamConjTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, alpha, A(k, m), ldak, /* ldak * X */ @@ -118,7 +322,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, else { if (k == m) { INSERT_TASK_zhemm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, alpha, A(k, k), ldak, /* ldak * K */ @@ -127,7 +331,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, alpha, A(m, k), ldam, /* lda * K */ @@ -151,7 +355,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, zbeta = k == 0 ? beta : zone; if (k < n) { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, alpha, B(m, k), ldbm, /* ldb * K */ @@ -161,7 +365,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, else { if (k == n) { INSERT_TASK_zhemm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, alpha, A(k, k), ldak, /* ldak * Y */ @@ -170,7 +374,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, alpha, B(m, k), ldbm, /* ldb * K */ @@ -190,7 +394,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, zbeta = k == 0 ? beta : zone; if (k < n) { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, alpha, B(m, k), ldbm, /* ldb * K */ @@ -200,7 +404,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, else { if (k == n) { INSERT_TASK_zhemm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, alpha, A(k, k), ldak, /* ldak * Y */ @@ -209,7 +413,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamConjTrans, tempmm, tempnn, tempkn, A->mb, alpha, B(m, k), ldbm, /* ldb * K */ @@ -222,5 +426,36 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, } } } - RUNTIME_options_finalize(&options, chamctxt); + (void)chamctxt; +} + +/** + * Parallel tile hermitian matrix-matrix multiplication. wrapper. + */ +void +chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); + + if ( ((C->p > 1) || (C->q > 1)) && + (C->get_rankof == chameleon_getrankof_2d) && + (chamctxt->generic_enabled != CHAMELEON_TRUE) ) + { + chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + + RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/compute/pzsymm.c b/compute/pzsymm.c index f7df025e27e6814f9fe11d2fd96b158d1714ff02..0413d63b5f94fe5266ec2140232998360b175de8 100644 --- a/compute/pzsymm.c +++ b/compute/pzsymm.c @@ -23,234 +23,439 @@ */ #include "control/common.h" -#define A(m,n) A, m, n -#define B(m,n) B, m, n -#define C(m,n) C, m, n +#define A(m, n) A, m, n +#define B(m, n) B, m, n +#define C(m, n) C, m, n +#define WA(m, n) &WA, m, n +#define WB(m, n) &WB, m, n + /** - * Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling + * Parallel tile symmetric matrix-matrix multiplication. + * SUMMA algorithm for 2D block-cyclic data distribution. */ -void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, - CHAMELEON_Complex64_t beta, CHAM_desc_t *C, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +static inline void +chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) { - CHAM_context_t *chamctxt; - RUNTIME_option_t options; - - int k, m, n; - int ldak, ldam, ldan, ldbk, ldbm, ldcm; - int tempmm, tempnn, tempkn, tempkm; + RUNTIME_sequence_t *sequence = options->sequence; + cham_trans_t transA, transB; + int Am, An, m, n, k, p, q, KT, K, lp, lq; + int ldam, ldbk, ldbm, ldcm; + int tempmm, tempnn, tempkk; + int lookahead, myp, myq; CHAMELEON_Complex64_t zbeta; CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + CHAM_desc_t WA, WB; - chamctxt = chameleon_context_self(); - if (sequence->status != CHAMELEON_SUCCESS) { - return; - } - RUNTIME_options_init(&options, chamctxt, sequence, request); - - /* - * ChamLeft - */ - if (side == ChamLeft) { - for (k = 0; k < C->mt; k++) { - tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; - ldak = BLKLDD(A, k); - ldbk = BLKLDD(B, k); - zbeta = k == 0 ? beta : zone; - - for (n = 0; n < C->nt; n++) { + lookahead = chamctxt->lookahead; + chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mt * C->mb, C->nb * C->q * lookahead, 0, 0, + C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q, + NULL, NULL, NULL ); + chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE, + ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb), + C->mb * C->p * lookahead, C->nt * C->nb, 0, 0, + C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q, + NULL, NULL, NULL ); + + KT = side == ChamLeft ? A->nt : A->mt; + K = side == ChamLeft ? A->n : A->m; + myp = C->myrank / C->q; + myq = C->myrank % C->q; + + for (k = 0; k < KT; k++ ) { + lp = (k % lookahead) * C->p; + lq = (k % lookahead) * C->q; + tempkk = k == KT - 1 ? K - k * A->nb : A->nb; + zbeta = k == 0 ? beta : zone; + ldbk = BLKLDD(B, k); + + /* Transfert ownership of the k column of A or B */ + for (m = 0; m < C->mt; m ++ ) { + tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb; + + if ( side == ChamLeft ) { + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) { + Am = k; + An = m; + } + else { + Am = m; + An = k; + } + ldam = BLKLDD(A, Am); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + A( Am, An ), ldam, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, A( Am, An ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + else { + ldbm = BLKLDD(B, m); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + B( m, k ), ldbm, + WA( m, (k % C->q) + lq ), WA.mb ); + + RUNTIME_data_flush( sequence, B( m, k ) ); + + for ( q=1; q < C->q; q++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempmm, tempkk, C->mb, + WA( m, ((k+q-1) % C->q) + lq ), WA.mb, + WA( m, ((k+q) % C->q) + lq ), WA.mb ); + } + } + } + + /* Transfert ownership of the k row of B, or A */ + for (n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + + if ( side == ChamRight ) { + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) { + Am = n; + An = k; + } + else { + Am = k; + An = n; + } + ldam = BLKLDD(A, Am); + + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + A( Am, An ), ldam, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, A( Am, An ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + else { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + B( k, n ), ldbk, + WB( (k % C->p) + lp, n ), WB.mb ); + + RUNTIME_data_flush( sequence, B( k, n ) ); + + for ( p=1; p < C->p; p++ ) { + INSERT_TASK_zlacpy( + options, + ChamUpperLower, tempkk, tempnn, C->mb, + WB( ((k+p-1) % C->p) + lp, n ), WB.mb, + WB( ((k+p) % C->p) + lp, n ), WB.mb ); + } + } + } + + /* + * ChamLeft / ChamLower + */ + for (m = myp; m < C->mt; m+=C->p) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + + for (n = myq; n < C->nt; n+=C->q) { tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - for (m = 0; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; - ldam = BLKLDD(A, m); - ldcm = BLKLDD(C, m); + if (side == ChamLeft) { + transB = ChamNoTrans; + if ( (( uplo == ChamUpper ) && ( m > k )) || + (( uplo == ChamLower ) && ( m < k )) ) { + transA = ChamTrans; + } + else { + transA = ChamNoTrans; + } + } + else { + transA = ChamNoTrans; + if ( (( uplo == ChamUpper ) && ( n < k )) || + (( uplo == ChamLower ) && ( n > k )) ) { + transB = ChamTrans; + } + else { + transB = ChamNoTrans; + } + } - /* - * ChamLeft / ChamLower - */ - if (uplo == ChamLower) { + if ( k == m ) { + INSERT_TASK_zsymm( + options, side, uplo, + tempmm, tempnn, A->mb, + alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ + WB( myp + lp, n ), WB.mb, /* ldb * Y */ + zbeta, C( m, n ), ldcm ); /* ldc * Y */ + } + else { + INSERT_TASK_zgemm( + options, transA, transB, + tempmm, tempnn, tempkk, A->mb, + alpha, WA( m, myq + lq ), WA.mb, /* lda * Z */ + WB( myp + lp, n ), WB.mb, /* ldb * Y */ + zbeta, C( m, n ), ldcm ); /* ldc * Y */ + } + } + } + } + + RUNTIME_desc_flush( &WA, sequence ); + RUNTIME_desc_flush( &WB, sequence ); + RUNTIME_desc_flush( C, sequence ); + chameleon_sequence_wait( chamctxt, sequence ); + chameleon_desc_destroy( &WA ); + chameleon_desc_destroy( &WB ); +} + +/** + * Parallel tile symmetric matrix-matrix multiplication. + * Generic algorithm for any data distribution. + */ +static inline void +chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_option_t *options ) +{ + int k, m, n; + int ldam, ldan, ldak, ldbk, ldbm, ldcm; + int tempmm, tempnn, tempkn, tempkm; + + CHAMELEON_Complex64_t zbeta; + CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0; + + for(m = 0; m < C->mt; m++) { + tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldcm = BLKLDD(C, m); + for(n = 0; n < C->nt; n++) { + tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; + /* + * ChamLeft / ChamLower + */ + if (side == ChamLeft) { + ldam = BLKLDD(A, m); + if (uplo == ChamLower) { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + ldak = BLKLDD(A, k); + ldbk = BLKLDD(B, k); + zbeta = k == 0 ? beta : zone; if (k < m) { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(m, k), ldam, /* lda * K */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { if (k == m) { INSERT_TASK_zsymm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, k), ldak, /* ldak * X */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { INSERT_TASK_zgemm( - &options, + options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, m), ldak, /* ldak * X */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } } } - /* - * ChamLeft / ChamUpper - */ - else { + } + /* + * ChamLeft / ChamUpper + */ + else { + for (k = 0; k < C->mt; k++) { + tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb; + ldak = BLKLDD(A, k); + ldbk = BLKLDD(B, k); + zbeta = k == 0 ? beta : zone; if (k < m) { INSERT_TASK_zgemm( - &options, + options, ChamTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(k, m), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, m), ldak, /* ldak * X */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { if (k == m) { INSERT_TASK_zsymm( - &options, + options, side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(k, k), ldak, /* ldak * K */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkm, A->mb, - alpha, A(m, k), ldam, - B(k, n), ldbk, - zbeta, C(m, n), ldcm); + alpha, A(m, k), ldam, /* lda * K */ + B(k, n), ldbk, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } } } } - RUNTIME_data_flush( sequence, B(k, n) ); - } - if (uplo == ChamLower) { - for (n = 0; n <= k; n++) { - RUNTIME_data_flush( sequence, A(k, n) ); - } } + /* + * ChamRight / ChamLower + */ else { - for (m = 0; m <= k; m++) { - RUNTIME_data_flush( sequence, A(m, k) ); - } - } - } - } - /* - * ChamRight - */ - else { - for (k = 0; k < C->nt; k++) { - tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; - ldak = BLKLDD(A, k); - zbeta = k == 0 ? beta : zone; - - for (m = 0; m < C->mt; m++) { - tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb; + ldan = BLKLDD(A, n); ldbm = BLKLDD(B, m); - ldcm = BLKLDD(C, m); - - for (n = 0; n < C->nt; n++) { - tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb; - ldan = BLKLDD(A, n); - - /* - * ChamRight / ChamLower - */ - if (uplo == ChamLower) { + if (uplo == ChamLower) { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + ldak = BLKLDD(A, k); + zbeta = k == 0 ? beta : zone; if (k < n) { - INSERT_TASK_zgemm( - &options, - ChamNoTrans, ChamTrans, - tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(n, k), ldan, - zbeta, C(m, n), ldcm); + INSERT_TASK_zgemm( + options, + ChamNoTrans, ChamTrans, + tempmm, tempnn, tempkn, A->mb, + alpha, B(m, k), ldbm, /* ldb * K */ + A(n, k), ldan, /* lda * K */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { if (k == n) { - INSERT_TASK_zsymm( - &options, - ChamRight, uplo, - tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, - B(m, k), ldbm, - zbeta, C(m, n), ldcm); + INSERT_TASK_zsymm( + options, + side, uplo, + tempmm, tempnn, A->mb, + alpha, A(k, k), ldak, /* ldak * Y */ + B(m, k), ldbm, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(k, n), ldak, - zbeta, C(m, n), ldcm); + alpha, B(m, k), ldbm, /* ldb * K */ + A(k, n), ldak, /* ldak * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } } } - /* - * ChamRight / ChamUpper - */ - else { + } + /* + * ChamRight / ChamUpper + */ + else { + for (k = 0; k < C->nt; k++) { + tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb; + ldak = BLKLDD(A, k); + zbeta = k == 0 ? beta : zone; if (k < n) { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamNoTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(k, n), ldak, - zbeta, C(m, n), ldcm); + alpha, B(m, k), ldbm, /* ldb * K */ + A(k, n), ldak, /* ldak * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { if (k == n) { INSERT_TASK_zsymm( - &options, - ChamRight, uplo, + options, + side, uplo, tempmm, tempnn, A->mb, - alpha, A(k, k), ldak, - B(m, k), ldbm, - zbeta, C(m, n), ldcm); + alpha, A(k, k), ldak, /* ldak * Y */ + B(m, k), ldbm, /* ldb * Y */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } else { INSERT_TASK_zgemm( - &options, + options, ChamNoTrans, ChamTrans, tempmm, tempnn, tempkn, A->mb, - alpha, B(m, k), ldbm, - A(n, k), ldan, - zbeta, C(m, n), ldcm); + alpha, B(m, k), ldbm, /* ldb * K */ + A(n, k), ldan, /* lda * K */ + zbeta, C(m, n), ldcm); /* ldc * Y */ } } } } - RUNTIME_data_flush( sequence, B(m, k) ); - } - if (uplo == ChamLower) { - for (n = 0; n <= k; n++) { - RUNTIME_data_flush( sequence, A(k, n) ); - } - } - else { - for (m = 0; m <= k; m++) { - RUNTIME_data_flush( sequence, A(m, k) ); - } } } } - RUNTIME_options_finalize(&options, chamctxt); + (void)chamctxt; +} + +/** + * Parallel tile symmetric matrix-matrix multiplication. wrapper. + */ +void +chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, + CHAMELEON_Complex64_t beta, CHAM_desc_t *C, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) +{ + CHAM_context_t *chamctxt; + RUNTIME_option_t options; + + chamctxt = chameleon_context_self(); + if (sequence->status != CHAMELEON_SUCCESS) { + return; + } + RUNTIME_options_init( &options, chamctxt, sequence, request ); + + if ( ((C->p > 1) || (C->q > 1)) && + (C->get_rankof == chameleon_getrankof_2d) && + (chamctxt->generic_enabled != CHAMELEON_TRUE) ) + { + chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + else { + chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options ); + } + + RUNTIME_options_finalize( &options, chamctxt ); } diff --git a/control/context.c b/control/context.c index 306e42215cf7e96f0d7b8755e27b307363dcbdc0..d57b0f1e697f6c423b7146929f01092d32bf0599 100644 --- a/control/context.c +++ b/control/context.c @@ -62,6 +62,7 @@ CHAM_context_t *chameleon_context_create() chamctxt->nb = 128; chamctxt->ib = 32; chamctxt->rhblock = 4; + chamctxt->lookahead = 3; chamctxt->nworkers = 1; chamctxt->ncudas = 0; @@ -72,11 +73,11 @@ CHAM_context_t *chameleon_context_create() chamctxt->parallel_enabled = CHAMELEON_FALSE; chamctxt->profiling_enabled = CHAMELEON_FALSE; chamctxt->progress_enabled = CHAMELEON_FALSE; + chamctxt->generic_enabled = CHAMELEON_FALSE; chamctxt->householder = ChamFlatHouseholder; chamctxt->translation = ChamOutOfPlace; - /* Initialize scheduler */ RUNTIME_context_create(chamctxt); @@ -120,6 +121,7 @@ int chameleon_context_destroy(){ * @arg CHAMELEON_PROFILING_MODE activate profiling of kernels * @arg CHAMELEON_PROGRESS activate progress indicator * @arg CHAMELEON_GEMM3M Use z/cgemm3m for complexe matrix-matrix products + * @arg CHAMELEON_GENERIC enable/disable GEMM3M Use z/cgemm3m for complexe matrix-matrix products * ******************************************************************************* * @@ -160,6 +162,9 @@ int CHAMELEON_Enable(int option) /* case CHAMELEON_PARALLEL: */ /* chamctxt->parallel_enabled = CHAMELEON_TRUE; */ /* break; */ + case CHAMELEON_GENERIC: + chamctxt->generic_enabled = CHAMELEON_TRUE; + break; default: chameleon_error("CHAMELEON_Enable", "illegal parameter value"); return CHAMELEON_ERR_ILLEGAL_VALUE; @@ -225,6 +230,9 @@ int CHAMELEON_Disable(int option) case CHAMELEON_PARALLEL_MODE: chamctxt->parallel_enabled = CHAMELEON_FALSE; break; + case CHAMELEON_GENERIC: + chamctxt->generic_enabled = CHAMELEON_FALSE; + break; default: chameleon_error("CHAMELEON_Disable", "illegal parameter value"); return CHAMELEON_ERR_ILLEGAL_VALUE; @@ -248,6 +256,7 @@ int CHAMELEON_Disable(int option) * Feature to be enabled: * @arg CHAMELEON_TILE_SIZE: size matrix tile, * @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block, + * @arg CHAMELEON_LOOKAHEAD: depth of the look ahead in algorithms * * @param[in] value * Value of the parameter. @@ -321,6 +330,13 @@ int CHAMELEON_Set( int param, int value ) } chamctxt->translation = value; break; + case CHAMELEON_LOOKAHEAD: + if (value < 1) { + chameleon_error("CHAMELEON_Set", "illegal value of CHAMELEON_LOOKAHEAD"); + return CHAMELEON_ERR_ILLEGAL_VALUE; + } + chamctxt->lookahead = value; + break; default: chameleon_error("CHAMELEON_Set", "unknown parameter"); return CHAMELEON_ERR_ILLEGAL_VALUE; @@ -341,6 +357,7 @@ int CHAMELEON_Set( int param, int value ) * Feature to be enabled: * @arg CHAMELEON_TILE_SIZE: size matrix tile, * @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block, + * @arg CHAMELEON_LOOKAHEAD: depth of the look ahead in algorithms * * @param[out] value * Value of the parameter. @@ -375,6 +392,9 @@ int CHAMELEON_Get(int param, int *value) case CHAMELEON_TRANSLATION_MODE: *value = chamctxt->translation; return CHAMELEON_SUCCESS; + case CHAMELEON_LOOKAHEAD: + *value = chamctxt->lookahead; + return CHAMELEON_SUCCESS; default: chameleon_error("CHAMELEON_Get", "unknown parameter"); return CHAMELEON_ERR_ILLEGAL_VALUE; diff --git a/control/descriptor.c b/control/descriptor.c index de4bd65c032d661372453d319e9659a58bb1faab..ad2d53f2859da764e7156e9872f5898f985214a3 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -68,6 +68,27 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc ) return CHAMELEON_SUCCESS; } +/** + * Internal function to return MPI rank of element A(m,n) with m,n = block indices + */ +int chameleon_getrankof_2d( const CHAM_desc_t *A, int m, int n ) +{ + int mm = m + A->i / A->mb; + int nn = n + A->j / A->nb; + return (mm % A->p) * A->q + (nn % A->q); +} + +/** + * Internal function to return MPI rank of element DIAG(m,0) with m,n = block indices + */ +int chameleon_getrankof_2d_diag( const CHAM_desc_t *A, int m, int n ) +{ + int mm = m + A->i / A->mb; + assert( m == n ); + return (mm % A->p) * A->q + (mm % A->q); +} + + /** ****************************************************************************** * diff --git a/control/descriptor.h b/control/descriptor.h index 2be4cafe9c59f7a13042b54e6a99bf0b265bd00c..6eff677110b0e4269a5366cc7ffcf24269050547 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -44,8 +44,8 @@ inline static int chameleon_getblkldd_ccrb(const CHAM_desc_t *A, int m); /** * Data distributions */ -inline static int chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n); -inline static int chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n); +int chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n); +int chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n); int chameleon_desc_init ( CHAM_desc_t *desc, void *mat, cham_flttype_t dtyp, int mb, int nb, int bsiz, @@ -175,27 +175,6 @@ inline static int chameleon_getblkldd_cm(const CHAM_desc_t *A, int m) { return A->llm; } -/** - * Internal function to return MPI rank of element A(m,n) with m,n = block indices - */ -inline static int chameleon_getrankof_2d(const CHAM_desc_t *A, int m, int n) -{ - int mm = m + A->i / A->mb; - int nn = n + A->j / A->nb; - return (mm % A->p) * A->q + (nn % A->q); -} - -/** - * Internal function to return MPI rank of element DIAG(m,0) with m,n = block indices - */ -inline static int chameleon_getrankof_2d_diag(const CHAM_desc_t *A, int m, int n) -{ - int mm = m + A->i / A->mb; - assert( m == n ); - return (mm % A->p) * A->q + (mm % A->q); -} - - /** * Detect if the tile is local or not */ diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index ec7ea257144d0f1fc0730feb2f5817b98cd956f0..968a259cc3470d1f409eeea0b1b2153e06c7e89b 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -182,6 +182,7 @@ typedef enum chameleon_store_e { #define CHAMELEON_BOUND 7 #define CHAMELEON_PROGRESS 8 #define CHAMELEON_GEMM3M 9 +#define CHAMELEON_GENERIC 10 /** * CHAMELEON constants - configuration parameters @@ -192,6 +193,7 @@ typedef enum chameleon_store_e { #define CHAMELEON_HOUSEHOLDER_MODE 5 #define CHAMELEON_HOUSEHOLDER_SIZE 6 #define CHAMELEON_TRANSLATION_MODE 7 +#define CHAMELEON_LOOKAHEAD 8 /** * @brief QR/LQ factorization trees diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h index 51315359bc8b6dad57bf577e9ea23e6bd6c6cada..cbd860c6da74bc5b5d5205506125f92d08c04720 100644 --- a/include/chameleon/struct.h +++ b/include/chameleon/struct.h @@ -116,6 +116,7 @@ typedef struct chameleon_context_s { cham_bool_t parallel_enabled; cham_bool_t profiling_enabled; cham_bool_t progress_enabled; + cham_bool_t generic_enabled; cham_householder_t householder; // "domino" (flat) or tree-based (reduction) Householder cham_translation_t translation; // In place or Out of place layout conversion @@ -123,6 +124,7 @@ typedef struct chameleon_context_s { int nb; int ib; int rhblock; // block size for tree-based (reduction) Householder + int lookahead; // depth of the look ahead in algorithms void *schedopt; // structure for runtimes int mpi_outer_init; // MPI has been initialized outside our functions } CHAM_context_t; diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c index 69a29fe31a1d1eba37d6e43be438b201c6e3d651..002be36104b8f3bb59ab7012bf31e35a4336716d 100644 --- a/runtime/starpu/control/runtime_context.c +++ b/runtime/starpu/control/runtime_context.c @@ -85,6 +85,8 @@ void RUNTIME_enable( void *runtime_ctxt, int lever ) default: return; } + + (void)runtime_ctxt; return; } @@ -107,5 +109,7 @@ void RUNTIME_disable( void *runtime_ctxt, int lever ) default: return; } + + (void)runtime_ctxt; return; } diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c index acba26abf071937a0d216ca1c9f3ccde22301955..78d2394efa8d6fa4f2fd0a73ce562119faf5d7a0 100644 --- a/runtime/starpu/control/runtime_profiling.c +++ b/runtime/starpu/control/runtime_profiling.c @@ -56,7 +56,7 @@ void RUNTIME_iteration_pop( CHAM_context_t *chamctxt ) void RUNTIME_start_profiling(){ #if defined(HAVE_STARPU_FXT_PROFILING) - starpu_fxt_start_profiling(); + starpu_fxt_start_profiling(); #else fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n"); #endif @@ -64,7 +64,7 @@ void RUNTIME_start_profiling(){ void RUNTIME_stop_profiling(){ #if defined(HAVE_STARPU_FXT_PROFILING) - starpu_fxt_stop_profiling(); + starpu_fxt_stop_profiling(); #else fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n"); #endif diff --git a/testing/testing_dgram.c b/testing/testing_dgram.c index c9ca38a3f7a3475e257f41d2a0c8343083efabcb..0bb2c73039c7b890fa8f674f86a3122e3ffa71ca 100644 --- a/testing/testing_dgram.c +++ b/testing/testing_dgram.c @@ -212,7 +212,6 @@ static int compute_gram_sequential(cham_uplo_t uplo, int LDA) { int m, n; - double eps; double squareij, mean_dij, mhalf; double *work = (double *)malloc(N * sizeof(double)); @@ -258,4 +257,4 @@ static int compute_gram_sequential(cham_uplo_t uplo, free(work); return 0; -} \ No newline at end of file +} diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt index cbe827665a7af49d0f1e77d7445b0ca350937a5a..8167545fd74905693d3d4c4d2ba9ef9d516eb952 100644 --- a/timing/CMakeLists.txt +++ b/timing/CMakeLists.txt @@ -103,6 +103,8 @@ set(ZSRC_LAP_INT set(ZSRC_TIL_INT # BLAS 3 time_zgemm_tile.c + time_zhemm_tile.c + time_zsymm_tile.c # LAPACK time_zgels_tile.c time_zgeqrf_hqr_tile.c diff --git a/timing/time_zhemm_tile.c b/timing/time_zhemm_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..04c16208a2745494de2e9aa2abf84e4194a16d74 --- /dev/null +++ b/timing/time_zhemm_tile.c @@ -0,0 +1,87 @@ +/** + * + * @file time_zhemm_tile.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @version 0.9.2 + * @author Mathieu Faverge + * @date 2014-11-16 + * @precisions normal z -> c + * + */ +#define _TYPE CHAMELEON_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "CHAMELEON_zhemm_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_HEMM( ChamLeft, M, N ) +#define _FADDS FADDS_HEMM( ChamLeft, M, N ) + +#include "./timing.c" +#include "timing_zauxiliary.h" + +static int +RunTest(int *iparam, double *dparam, chameleon_time_t *t_) +{ + CHAMELEON_Complex64_t alpha, beta; + PASTE_CODE_IPARAM_LOCALS( iparam ); + + LDA = chameleon_max(M, iparam[IPARAM_LDA]); + LDB = chameleon_max(M, iparam[IPARAM_LDB]); + LDC = chameleon_max(M, iparam[IPARAM_LDC]); + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N ); + + /* Initialize Data */ + CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 ); + CHAMELEON_zplrnt_Tile( descB, 7672 ); + CHAMELEON_zplrnt_Tile( descC, 6387 ); + +#if !defined(CHAMELEON_SIMULATION) + LAPACKE_zlarnv_work(1, ISEED, 1, &alpha); + LAPACKE_zlarnv_work(1, ISEED, 1, &beta); +#else + alpha = 1.5; + beta = -2.3; +#endif + + /* Save C for check */ + PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N ); + + START_TIMING(); + CHAMELEON_zhemm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC ); + STOP_TIMING(); + +#if !defined(CHAMELEON_SIMULATION) + /* Check the solution */ + if (check) + { + PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M ); + PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N ); + PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N ); + + dparam[IPARAM_RES] = z_check_hemm( ChamLeft, ChamUpper, M, N, + alpha, A, LDA, B, LDB, beta, C, C2, LDC, + &(dparam[IPARAM_ANORM]), + &(dparam[IPARAM_BNORM]), + &(dparam[IPARAM_XNORM]) ); + + free(A); free(B); free(C); free(C2); + } +#endif + + PASTE_CODE_FREE_MATRIX( descA ); + PASTE_CODE_FREE_MATRIX( descB ); + PASTE_CODE_FREE_MATRIX( descC ); + return 0; +} diff --git a/timing/time_zsymm_tile.c b/timing/time_zsymm_tile.c new file mode 100644 index 0000000000000000000000000000000000000000..0675e238c999f94e1dc902caded132415e059a6d --- /dev/null +++ b/timing/time_zsymm_tile.c @@ -0,0 +1,87 @@ +/** + * + * @file time_zsymm_tile.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @version 0.9.2 + * @author Mathieu Faverge + * @date 2014-11-16 + * @precisions normal z -> c d s + * + */ +#define _TYPE CHAMELEON_Complex64_t +#define _PREC double +#define _LAMCH LAPACKE_dlamch_work + +#define _NAME "CHAMELEON_zsymm_Tile" +/* See Lawn 41 page 120 */ +#define _FMULS FMULS_SYMM( ChamLeft, M, N ) +#define _FADDS FADDS_SYMM( ChamLeft, M, N ) + +#include "./timing.c" +#include "timing_zauxiliary.h" + +static int +RunTest(int *iparam, double *dparam, chameleon_time_t *t_) +{ + CHAMELEON_Complex64_t alpha, beta; + PASTE_CODE_IPARAM_LOCALS( iparam ); + + LDA = chameleon_max(M, iparam[IPARAM_LDA]); + LDB = chameleon_max(M, iparam[IPARAM_LDB]); + LDC = chameleon_max(M, iparam[IPARAM_LDC]); + + /* Allocate Data */ + PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N ); + PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N ); + + /* Initialize Data */ + CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 ); + CHAMELEON_zplrnt_Tile( descB, 7672 ); + CHAMELEON_zplrnt_Tile( descC, 6387 ); + +#if !defined(CHAMELEON_SIMULATION) + LAPACKE_zlarnv_work(1, ISEED, 1, &alpha); + LAPACKE_zlarnv_work(1, ISEED, 1, &beta); +#else + alpha = 1.5; + beta = -2.3; +#endif + + /* Save C for check */ + PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N ); + + START_TIMING(); + CHAMELEON_zsymm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC ); + STOP_TIMING(); + +#if !defined(CHAMELEON_SIMULATION) + /* Check the solution */ + if (check) + { + PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M ); + PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N ); + PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N ); + + dparam[IPARAM_RES] = z_check_symm( ChamLeft, ChamUpper, M, N, + alpha, A, LDA, B, LDB, beta, C, C2, LDC, + &(dparam[IPARAM_ANORM]), + &(dparam[IPARAM_BNORM]), + &(dparam[IPARAM_XNORM]) ); + + free(A); free(B); free(C); free(C2); + } +#endif + + PASTE_CODE_FREE_MATRIX( descA ); + PASTE_CODE_FREE_MATRIX( descB ); + PASTE_CODE_FREE_MATRIX( descC ); + return 0; +} diff --git a/timing/timing_zauxiliary.c b/timing/timing_zauxiliary.c index 007d57c85ef9f7e0b50817a179b3404dcf069141..b8c995e34f2a6b18a60ecd70e2d68ba981c5f1b7 100644 --- a/timing/timing_zauxiliary.c +++ b/timing/timing_zauxiliary.c @@ -247,6 +247,70 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int return Rnorm; } +#if defined(PRECISION_z) || defined(PRECISION_c) +/*-------------------------------------------------------------- + * Check the hemm + */ +double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham, + CHAMELEON_Complex64_t *Cref, int LDC, + double *Cinitnorm, double *Cchamnorm, double *Clapacknorm ) +{ + CHAMELEON_Complex64_t beta_const = -1.0; + double Rnorm; + double *work = (double *)malloc( chameleon_max(M, N)* sizeof(double) ); + + *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work ); + + cblas_zhemm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N, + CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC ); + + *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + + cblas_zaxpy( LDC * N, CBLAS_SADDR(beta_const), Ccham, 1, Cref, 1 ); + + Rnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + + free(work); + + return Rnorm; +} +#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + +/*-------------------------------------------------------------- + * Check the symm + */ +double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham, + CHAMELEON_Complex64_t *Cref, int LDC, + double *Cinitnorm, double *Cchamnorm, double *Clapacknorm ) +{ + CHAMELEON_Complex64_t beta_const = -1.0; + double Rnorm; + double *work = (double *)malloc( chameleon_max(M, N)* sizeof(double) ); + + *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work ); + + cblas_zsymm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N, + CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC ); + + *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + + cblas_zaxpy( LDC * N, CBLAS_SADDR(beta_const), Ccham, 1, Cref, 1 ); + + Rnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work ); + + free(work); + + return Rnorm; +} + /*-------------------------------------------------------------- * Check the trsm */ diff --git a/timing/timing_zauxiliary.h b/timing/timing_zauxiliary.h index 6fbb0ff15a89aca664486166c55272a3b50b5468..ed9c7df0bf5f4f13da61317923821c39f38607d4 100644 --- a/timing/timing_zauxiliary.h +++ b/timing/timing_zauxiliary.h @@ -28,6 +28,21 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int CHAMELEON_Complex64_t *Cref, int LDC, double *Cinitnorm, double *Cchamnorm, double *Clapacknorm ); +#if defined(PRECISION_z) || defined(PRECISION_c) +double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham, + CHAMELEON_Complex64_t *Cref, int LDC, + double *Cinitnorm, double *Cchamnorm, double *Clapacknorm ); +#endif +double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N, + CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA, + const CHAMELEON_Complex64_t *B, int LDB, + CHAMELEON_Complex64_t beta, const CHAMELEON_Complex64_t *Ccham, + CHAMELEON_Complex64_t *Cref, int LDC, + double *Cinitnorm, double *Cchamnorm, double *Clapacknorm ); + double z_check_trsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, cham_diag_t diag, int M, int NRHS, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,