diff --git a/compute/pzlaset.c b/compute/pzlaset.c index 4bac0246c985f492a613f79c97cff75c4e2bb48e..d874f8dd8798fb72a3bc1536ab55dc1e4e47a698 100644 --- a/compute/pzlaset.c +++ b/compute/pzlaset.c @@ -29,10 +29,10 @@ * Parallel initialization a 2-D array A to BETA on the diagonal and * ALPHA on the offdiagonals. */ -void chameleon_pzlaset(cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, - CHAM_desc_t *A, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request) +void chameleon_pzlaset( cham_uplo_t uplo, + CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, + CHAM_desc_t *A, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; @@ -71,18 +71,28 @@ void chameleon_pzlaset(cham_uplo_t uplo, } } else if (uplo == ChamUpper) { - for (j = 0; j < A->nt; j++){ - tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; - for (i = 0; i < chameleon_min(j+1, A->mt); i++){ - tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; - ldai = BLKLDD(A, i); - INSERT_TASK_zlaset( - &options, - ChamUpperLower, tempim, tempjn, - alpha, (i == j) ? beta : alpha, - A(i, j), ldai); - } - } + for (i = 0; i < A->mt; i++) { + tempim = i == A->mt-1 ? A->m-i*A->mb : A->mb; + ldai = BLKLDD(A, i); + + if ( i < A->nt ) { + j = i; + tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; + + INSERT_TASK_zlaset( + &options, + uplo, tempim, tempjn, + alpha, beta, A(i, j), ldai); + } + for (j = i+1; j < A->nt; j++) { + tempjn = j == A->nt-1 ? A->n-j*A->nb : A->nb; + + INSERT_TASK_zlaset( + &options, + ChamUpperLower, tempim, tempjn, + alpha, alpha, A(i, j), ldai); + } + } } else { for (i = 0; i < A->mt; i++){ diff --git a/compute/pzplghe.c b/compute/pzplghe.c index f2896cf94fcb49b83acbf5bf9d87c4a1798884c6..27ae7d355601139cef866aca8ceca1e3be7d99ee 100644 --- a/compute/pzplghe.c +++ b/compute/pzplghe.c @@ -12,9 +12,6 @@ * @brief Chameleon zplghe parallel algorithm * * @version 0.9.2 - * @comment This file is a copy from pzplghe.c - * wich has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -31,13 +28,13 @@ * chameleon_pzplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles. */ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, - unsigned long long int seed, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + unsigned long long int seed, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; - int m, n; + int m, n, minmn; int ldam; int tempmm, tempnn; @@ -47,16 +44,15 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, } RUNTIME_options_init(&options, chamctxt, sequence, request); - for (m = 0; m < A->mt; m++) { - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); + minmn = chameleon_min( A->mt, A->nt ); + switch ( uplo ) { + case ChamLower: + for (n = 0; n < minmn; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - /* - * ChamLower - */ - if (uplo == ChamLower) { - for (n = 0; n <= m; n++) { - tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + for (m = n; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); options.priority = m + n; INSERT_TASK_zplghe( @@ -65,10 +61,13 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, A->m, m*A->mb, n*A->nb, seed ); } } - /* - * ChamUpper - */ - else if (uplo == ChamUpper) { + break; + + case ChamUpper: + for (m = 0; m < minmn; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + for (n = m; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -79,13 +78,17 @@ void chameleon_pzplghe( double bump, cham_uplo_t uplo, CHAM_desc_t *A, A->m, m*A->mb, n*A->nb, seed ); } } - /* - * ChamUpperLower - */ - else { + + case ChamUpperLower: + default: + for (m = 0; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + options.priority = m + n; INSERT_TASK_zplghe( &options, bump, tempmm, tempnn, A(m, n), ldam, diff --git a/compute/pzplgsy.c b/compute/pzplgsy.c index e9877e76610d88494098fc3cbfeadd263af1df59..38bc8fcba6f6dd5831f0a5493efb6bf8400df559 100644 --- a/compute/pzplgsy.c +++ b/compute/pzplgsy.c @@ -12,9 +12,6 @@ * @brief Chameleon zplgsy parallel algorithm * * @version 0.9.2 - * @comment This file is a copy of pzplgsy.c, - wich has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede @@ -31,13 +28,13 @@ * chameleon_pzplgsy - Generate a random symmetric (positive definite if 'bump' is large enough) half-matrix by tiles. */ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_t *A, - unsigned long long int seed, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + unsigned long long int seed, + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; RUNTIME_option_t options; - int m, n; + int m, n, minmn; int ldam; int tempmm, tempnn; @@ -47,16 +44,15 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ } RUNTIME_options_init(&options, chamctxt, sequence, request); - for (m = 0; m < A->mt; m++) { - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - ldam = BLKLDD(A, m); + minmn = chameleon_min( A->mt, A->nt ); + switch ( uplo ) { + case ChamLower: + for (n = 0; n < minmn; n++) { + tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - /* - * ChamLower - */ - if (uplo == ChamLower) { - for (n = 0; n <= m; n++) { - tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + for (m = n; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); options.priority = m + n; INSERT_TASK_zplgsy( @@ -65,10 +61,13 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ A->m, m*A->mb, n*A->nb, seed ); } } - /* - * ChamUpper - */ - else if (uplo == ChamUpper) { + break; + + case ChamUpper: + for (m = 0; m < minmn; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + for (n = m; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; @@ -79,13 +78,17 @@ void chameleon_pzplgsy( CHAMELEON_Complex64_t bump, cham_uplo_t uplo, CHAM_desc_ A->m, m*A->mb, n*A->nb, seed ); } } - /* - * ChamUpperLower - */ - else { + + case ChamUpperLower: + default: + for (m = 0; m < A->mt; m++) { + tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; + ldam = BLKLDD(A, m); + for (n = 0; n < A->nt; n++) { tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; + options.priority = m + n; INSERT_TASK_zplgsy( &options, bump, tempmm, tempnn, A(m, n), ldam, diff --git a/compute/zlascal.c b/compute/zlascal.c index be031b90a78a97303d6ec98b06f16bd55737b277..4aa20ab8bf63772f58e989cd693e2539bee4451a 100644 --- a/compute/zlascal.c +++ b/compute/zlascal.c @@ -179,7 +179,7 @@ int CHAMELEON_zlascal( cham_uplo_t uplo, int M, int N, * */ int CHAMELEON_zlascal_Tile( cham_uplo_t uplo, - CHAMELEON_Complex64_t alpha, CHAM_desc_t *A ) + CHAMELEON_Complex64_t alpha, CHAM_desc_t *A ) { CHAM_context_t *chamctxt; RUNTIME_sequence_t *sequence = NULL; diff --git a/compute/zpotrf.c b/compute/zpotrf.c index 41093b0ca3e030754b7aa0e7931c85e5d1bb4994..974312ddf660297bb08e151a305667c9734764fe 100644 --- a/compute/zpotrf.c +++ b/compute/zpotrf.c @@ -243,7 +243,7 @@ int CHAMELEON_zpotrf_Tile( cham_uplo_t uplo, CHAM_desc_t *A ) * */ int CHAMELEON_zpotrf_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, - RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) + RUNTIME_sequence_t *sequence, RUNTIME_request_t *request ) { CHAM_context_t *chamctxt; diff --git a/coreblas/compute/core_zgeadd.c b/coreblas/compute/core_zgeadd.c index 2707f0fc8201591a0587d2420dfb4609069e6731..4882a79cf2152eca6eaa05430f77c0f98261ba4c 100644 --- a/coreblas/compute/core_zgeadd.c +++ b/coreblas/compute/core_zgeadd.c @@ -21,6 +21,7 @@ * */ #include "coreblas.h" +#include "coreblas/lapacke.h" /** ****************************************************************************** @@ -112,12 +113,21 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, return -8; } + if ( beta == 0. ) { + LAPACKE_zlaset_work( LAPACK_COL_MAJOR, 'A', + M, N, 0., 0., B, LDB ); + } + else if ( beta != 1. ) { + LAPACKE_zlascl_work( LAPACK_COL_MAJOR, 'G', + 0, 0, 1., beta, M, N, B, LDB ); + } + switch( trans ) { #if defined(PRECISION_z) || defined(PRECISION_c) case ChamConjTrans: for (j=0; j<N; j++, A++) { for(i=0; i<M; i++, B++) { - *B = beta * (*B) + alpha * conj(A[LDA*i]); + *B += alpha * conj(A[LDA*i]); } B += LDB-M; } @@ -127,7 +137,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, case ChamTrans: for (j=0; j<N; j++, A++) { for(i=0; i<M; i++, B++) { - *B = beta * (*B) + alpha * A[LDA*i]; + *B += alpha * A[LDA*i]; } B += LDB-M; } @@ -137,7 +147,7 @@ int CORE_zgeadd(cham_trans_t trans, int M, int N, default: for (j=0; j<N; j++) { for(i=0; i<M; i++, B++, A++) { - *B = beta * (*B) + alpha * (*A); + *B += alpha * (*A); } A += LDA-M; B += LDB-M; diff --git a/coreblas/compute/core_zplghe.c b/coreblas/compute/core_zplghe.c index 70be1636cb086edf46dd41ea14630d70b92a4146..d6ca0bc739ee126830d96f322ecc851dfe6b3576 100644 --- a/coreblas/compute/core_zplghe.c +++ b/coreblas/compute/core_zplghe.c @@ -78,10 +78,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, * Tile diagonal */ if ( m0 == n0 ) { - for (j = 0; j < n; j++) { + int minmn = chameleon_min( m, n ); + + /* Lower part */ + for (j = 0; j < minmn; j++) { ran = Rnd64_jump( NBELEM * jump, seed ); - for (i = j; i < m; i++) { + *tmp = 0.5f - ran * RndF_Mul; + ran = Rnd64_A * ran + Rnd64_C; +#if defined(PRECISION_z) || defined(PRECISION_c) + ran = Rnd64_A * ran + Rnd64_C; +#endif + *tmp = creal( *tmp + bump ); + tmp++; + + for (i = j+1; i < m; i++) { *tmp = 0.5f - ran * RndF_Mul; ran = Rnd64_A * ran + Rnd64_C; #if defined(PRECISION_z) || defined(PRECISION_c) @@ -94,16 +105,21 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, jump += bigM + 1; } - for (j = 0; j < n; j++) { + /* Upper part */ + jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)bigM; + + for (i = 0; i < minmn; i++) { + ran = Rnd64_jump( NBELEM * (jump+i+1), seed ); + + for (j = i+1; j < n; j++) { + A[j*lda+i] = 0.5f - ran * RndF_Mul; + ran = Rnd64_A * ran + Rnd64_C; #if defined(PRECISION_z) || defined(PRECISION_c) - A[j+j*lda] += bump - I*cimag( A[j+j*lda] ); -#else - A[j+j*lda] += bump; + A[j*lda+i] -= I*(0.5f - ran * RndF_Mul); + ran = Rnd64_A * ran + Rnd64_C; #endif - - for (i=0; i<j; i++) { - A[lda*j+i] = conj( A[lda*i+j] ); } + jump += bigM; } } /* @@ -148,6 +164,3 @@ void CORE_zplghe( double bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, } } } - - - diff --git a/coreblas/compute/core_zplgsy.c b/coreblas/compute/core_zplgsy.c index dbd252066e11bf1bf6db69dab951dd56cf5f2cbb..f92f90a8d77edab31822031785b6cf5c3eec035e 100644 --- a/coreblas/compute/core_zplgsy.c +++ b/coreblas/compute/core_zplgsy.c @@ -63,7 +63,6 @@ Rnd64_jump(unsigned long long int n, unsigned long long int seed ) { return ran; } - // CORE_zplgsy - Generate a tile for random symmetric (positive definite if 'bump' is large enough) matrix. void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t *A, int lda, @@ -79,10 +78,22 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ * Tile diagonal */ if ( m0 == n0 ) { - for (j = 0; j < n; j++) { + int minmn = chameleon_min( m, n ); + + /* Lower part */ + for (j = 0; j < minmn; j++) { ran = Rnd64_jump( NBELEM * jump, seed ); - for (i = j; i < m; i++) { + *tmp = 0.5f - ran * RndF_Mul; + ran = Rnd64_A * ran + Rnd64_C; +#if defined(PRECISION_z) || defined(PRECISION_c) + *tmp += I*(0.5f - ran * RndF_Mul); + ran = Rnd64_A * ran + Rnd64_C; +#endif + *tmp = *tmp + bump; + tmp++; + + for (i = j+1; i < m; i++) { *tmp = 0.5f - ran * RndF_Mul; ran = Rnd64_A * ran + Rnd64_C; #if defined(PRECISION_z) || defined(PRECISION_c) @@ -95,12 +106,21 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ jump += bigM + 1; } - for (j = 0; j < n; j++) { - A[j+j*lda] += bump; + /* Upper part */ + jump = (unsigned long long int)m0 + (unsigned long long int)n0 * (unsigned long long int)bigM; - for (i=0; i<j; i++) { - A[lda*j+i] = A[lda*i+j]; + for (i = 0; i < minmn; i++) { + ran = Rnd64_jump( NBELEM * (jump+i+1), seed ); + + for (j = i+1; j < n; j++) { + A[j*lda+i] = 0.5f - ran * RndF_Mul; + ran = Rnd64_A * ran + Rnd64_C; +#if defined(PRECISION_z) || defined(PRECISION_c) + A[j*lda+i] += I*(0.5f - ran * RndF_Mul); + ran = Rnd64_A * ran + Rnd64_C; +#endif } + jump += bigM; } } /* @@ -145,6 +165,3 @@ void CORE_zplgsy( CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_ } } } - - - diff --git a/coreblas/compute/core_ztradd.c b/coreblas/compute/core_ztradd.c index 7f18f482a1571eb400d294517a83f8ca3555fff6..09af0f44081c1e626aba9e1dcbb44740c1b3e37f 100644 --- a/coreblas/compute/core_ztradd.c +++ b/coreblas/compute/core_ztradd.c @@ -20,6 +20,7 @@ * */ #include "coreblas.h" +#include "coreblas/lapacke.h" /** ****************************************************************************** @@ -95,7 +96,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t *B, int LDB) { - int i, j; + int i, j, minMN; if (uplo == ChamUpperLower){ int rc = CORE_zgeadd( trans, M, N, alpha, A, LDA, beta, B, LDB ); @@ -137,6 +138,17 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, return -9; } + minMN = chameleon_min( M, N ); + + if ( beta == 0. ) { + LAPACKE_zlaset_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo), + M, N, 0., 0., B, LDB ); + } + else if ( beta != 1. ) { + LAPACKE_zlascl_work( LAPACK_COL_MAJOR, chameleon_lapack_const(uplo), + 0, 0, 1., beta, M, N, B, LDB ); + } + /** * ChamLower */ @@ -144,9 +156,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, switch( trans ) { #if defined(PRECISION_z) || defined(PRECISION_c) case ChamConjTrans: - for (j=0; j<N; j++, A++) { + for (j=0; j<minMN; j++, A++) { for(i=j; i<M; i++, B++) { - *B = beta * (*B) + alpha * conj(A[LDA*i]); + *B += alpha * conj(A[LDA*i]); } B += LDB-M+j+1; } @@ -154,9 +166,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ case ChamTrans: - for (j=0; j<N; j++, A++) { + for (j=0; j<minMN; j++, A++) { for(i=j; i<M; i++, B++) { - *B = beta * (*B) + alpha * A[LDA*i]; + *B += alpha * A[LDA*i]; } B += LDB-M+j+1; } @@ -164,9 +176,9 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, case ChamNoTrans: default: - for (j=0; j<N; j++) { + for (j=0; j<minMN; j++) { for(i=j; i<M; i++, B++, A++) { - *B = beta * (*B) + alpha * (*A); + *B += alpha * (*A); } B += LDB-M+j+1; A += LDA-M+j+1; @@ -183,7 +195,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, for (j=0; j<N; j++, A++) { int mm = chameleon_min( j+1, M ); for(i=0; i<mm; i++, B++) { - *B = beta * (*B) + alpha * conj(A[LDA*i]); + *B += alpha * conj(A[LDA*i]); } B += LDB-mm; } @@ -194,7 +206,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, for (j=0; j<N; j++, A++) { int mm = chameleon_min( j+1, M ); for(i=0; i<mm; i++, B++) { - *B = beta * (*B) + alpha * (A[LDA*i]); + *B += alpha * (A[LDA*i]); } B += LDB-mm; } @@ -205,7 +217,7 @@ int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N, for (j=0; j<N; j++) { int mm = chameleon_min( j+1, M ); for(i=0; i<mm; i++, B++, A++) { - *B = beta * (*B) + alpha * (*A); + *B += alpha * (*A); } B += LDB-mm; A += LDA-mm;