diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c index dcc95ba81a8b8465e753ca77ad30426f71ba7fd4..a8c45399f82897c073438226d45157cd2c8d0eab 100644 --- a/compute/pzgelqf_param.c +++ b/compute/pzgelqf_param.c @@ -95,14 +95,19 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t T = TS; for (i = 0; i < nbgeqrt; i++) { p = qrtree->getm(qrtree, k, i); + + /* /\* We skip the LQ factorization if this is the last diagonal tile *\/ */ + /* if ( (uplo == ChamLower) && (p == k) ) { */ + /* continue; */ + /* } */ + temppn = p == A->nt-1 ? A->n-p*A->nb : A->nb; tempkmin = chameleon_min(tempkm, temppn); INSERT_TASK_zgelqt( &options, tempkm, temppn, ib, T->nb, - A( k, p), - T(k, p)); + A(k, p), T(k, p)); if ( genD ) { int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb; @@ -111,8 +116,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t INSERT_TASK_zlacpy( &options, ChamUpper, tempDkm, tempDpn, A->nb, - A(k, p), - D(k, p) ); + A(k, p), D(k, p) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( &options, @@ -141,14 +145,19 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t for (i = 0; i < nbtiles; i++) { n = tiles[i]; - p = qrtree->currpiv(qrtree, k, n); + p = qrtree->currpiv( qrtree, k, n ); tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb; - if ( qrtree->gettype(qrtree, k, n) == LIBHQR_KILLED_BY_TS ) { + if ( qrtree->gettype( qrtree, k, n ) == LIBHQR_KILLED_BY_TS ) { /* TS kernel */ T = TS; L = 0; + + /* /\* Force TT kernel if this is the last diagonal tile *\/ */ + /* if ( (uplo == ChamLower) && (n == k) ) { */ + /* L = tempnn; */ + /* } */ } else { /* TT kernel */ diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c index 41c882417250f8b28d2a152578397c6ee84a72a0..accf33506175e76baa0dad7319b4919b00afbcc1 100644 --- a/compute/pzgeqrf_param.c +++ b/compute/pzgeqrf_param.c @@ -46,7 +46,7 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, CHAM_desc_t *T; int m, n, i, p; int L, nbgeqrt; - int tempkmin, tempkn, tempnn, tempmm; + int tempkmin, tempkn, tempnn, tempmm, temppm; int node, nbtiles; tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb; @@ -56,35 +56,35 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, T = TS; for (i = 0; i < nbgeqrt; i++) { - m = qrtree->getm( qrtree, k, i ); + p = qrtree->getm( qrtree, k, i ); /* We skip the QR factorization if this is the last diagonal tile */ - if ( (uplo == ChamUpper) && (m == k) ) { + if ( (uplo == ChamUpper) && (p == k) ) { continue; } - tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb; - tempkmin = chameleon_min(tempmm, tempkn); + temppm = p == A->mt-1 ? A->m-p*A->mb : A->mb; + tempkmin = chameleon_min(temppm, tempkn); INSERT_TASK_zgeqrt( options, - tempmm, tempkn, ib, T->nb, - A(m, k), T(m, k) ); + temppm, tempkn, ib, T->nb, + A(p, k), T(p, k) ); if ( genD ) { - int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb; + int tempDpm = p == D->mt-1 ? D->m-p*D->mb : D->mb; int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb; INSERT_TASK_zlacpy( options, - ChamLower, tempDmm, tempDkn, A->nb, - A(m, k), D(m, k) ); + ChamLower, tempDpm, tempDkn, A->nb, + A(p, k), D(p, k) ); #if defined(CHAMELEON_USE_CUDA) INSERT_TASK_zlaset( options, - ChamUpper, tempDmm, tempDkn, + ChamUpper, tempDpm, tempDkn, 0., 1., - D(m, k) ); + D(p, k) ); #endif } @@ -93,16 +93,16 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, INSERT_TASK_zunmqr( options, ChamLeft, ChamConjTrans, - tempmm, tempnn, tempkmin, ib, T->nb, - D(m, k), - T(m, k), - A(m, n)); + temppm, tempnn, tempkmin, ib, T->nb, + D(p, k), + T(p, k), + A(p, n)); } if ( genD || ((k+1) < A->nt)) { - RUNTIME_data_flush( sequence, D(m, k) ); + RUNTIME_data_flush( sequence, D(p, k) ); } - RUNTIME_data_flush( sequence, T(m, k) ); + RUNTIME_data_flush( sequence, T(p, k) ); } /* Setting the order of the tiles */ @@ -151,7 +151,7 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib, INSERT_TASK_ztpmqrt( options, ChamLeft, ChamConjTrans, - tempmm, tempnn, A->nb, L, ib, T->nb, + tempmm, tempnn, tempkn, L, ib, T->nb, A(m, k), T(m, k), A(p, n), diff --git a/coreblas/compute/core_zpamm.c b/coreblas/compute/core_zpamm.c index f3b9d3aa429b7299c6427cb79a7a7fb331fae276..d3c750fdbd92953ecf519b6759cfb3d4ac25ab4c 100644 --- a/coreblas/compute/core_zpamm.c +++ b/coreblas/compute/core_zpamm.c @@ -284,18 +284,22 @@ CORE_zpamm(int op, cham_side_t side, cham_store_t storev, } /**/ - if (op==ChameleonW) { + if ( op == ChameleonW ) + { info = CORE_zpamm_w( - side, trans, uplo, M, N, K, L, vi2, vi3, - A1, LDA1, A2, LDA2, V, LDV, W, LDW); - if (info != 0) - return info; - } else if (op==ChameleonA2) { + side, trans, uplo, M, N, K, L, vi2, vi3, + A1, LDA1, A2, LDA2, V, LDV, W, LDW); + } + else if ( op == ChameleonA2 ) + { info = CORE_zpamm_a2( - side, trans, uplo, M, N, K, L, vi2, vi3, - A2, LDA2, V, LDV, W, LDW); - if (info != 0) - return info; + side, trans, uplo, M, N, K, L, vi2, vi3, + A2, LDA2, V, LDV, W, LDW); + } + + + if ( info != 0 ) { + return info; } return CHAMELEON_SUCCESS; @@ -303,28 +307,26 @@ CORE_zpamm(int op, cham_side_t side, cham_store_t storev, /**/ static inline int -CORE_zpamm_w(cham_side_t side, cham_trans_t trans, cham_uplo_t uplo, - int M, int N, int K, int L, - int vi2, int vi3, - const CHAMELEON_Complex64_t *A1, int LDA1, - CHAMELEON_Complex64_t *A2, int LDA2, - const CHAMELEON_Complex64_t *V, int LDV, - CHAMELEON_Complex64_t *W, int LDW) +CORE_zpamm_w( cham_side_t side, cham_trans_t trans, cham_uplo_t uplo, + int M, int N, int K, int L, int vi2, int vi3, + const CHAMELEON_Complex64_t *A1, int LDA1, + CHAMELEON_Complex64_t *A2, int LDA2, + const CHAMELEON_Complex64_t *V, int LDV, + CHAMELEON_Complex64_t *W, int LDW ) { /* * W = A1 + op(V) * A2 or W = A1 + A2 * op(V) */ - int j; - static CHAMELEON_Complex64_t zone = 1.0; - static CHAMELEON_Complex64_t zzero = 0.0; + static CHAMELEON_Complex64_t zone = 1.0; + static CHAMELEON_Complex64_t zzero = 0.0; - if (side == ChamLeft) { - - if (((trans == ChamConjTrans) && (uplo == ChamUpper)) || - ((trans == ChamNoTrans) && (uplo == ChamLower))) { + if ( side == ChamLeft ) { + if ( ((trans == ChamConjTrans) && (uplo == ChamUpper)) || + ((trans == ChamNoTrans) && (uplo == ChamLower)) ) + { /* * W = A1 + V' * A2 */ @@ -445,32 +447,29 @@ CORE_zpamm_w(cham_side_t side, cham_trans_t trans, cham_uplo_t uplo, /**/ static inline int -CORE_zpamm_a2(cham_side_t side, cham_trans_t trans, cham_uplo_t uplo, - int M, int N, int K, int L, - int vi2, int vi3, - CHAMELEON_Complex64_t *A2, int LDA2, - const CHAMELEON_Complex64_t *V, int LDV, - CHAMELEON_Complex64_t *W, int LDW) +CORE_zpamm_a2( cham_side_t side, cham_trans_t trans, cham_uplo_t uplo, + int M, int N, int K, int L, int vi2, int vi3, + CHAMELEON_Complex64_t *A2, int LDA2, + const CHAMELEON_Complex64_t *V, int LDV, + CHAMELEON_Complex64_t *W, int LDW ) { - - /* - * A2 = A2 + op(V) * W or A2 = A2 + W * op(V) - */ - + /* + * A2 = A2 + op(V) * W or A2 = A2 + W * op(V) + */ int j; static CHAMELEON_Complex64_t zone = 1.0; - static CHAMELEON_Complex64_t mzone = -1.0; + static CHAMELEON_Complex64_t mzone = -1.0; - if (side == ChamLeft) { - - if (((trans == ChamConjTrans) && (uplo == ChamUpper)) || - ((trans == ChamNoTrans) && (uplo == ChamLower))) { + if ( side == ChamLeft ) { + if ( ((trans == ChamConjTrans) && (uplo == ChamUpper)) || + ((trans == ChamNoTrans) && (uplo == ChamLower)) ) + { printf("Left Upper/ConjTrans & Lower/NoTrans not implemented yet\n"); return CHAMELEON_ERR_NOT_SUPPORTED; - } - else { //trans + else + { //trans /* * A2 = A2 - V * W diff --git a/coreblas/compute/core_zparfb.c b/coreblas/compute/core_zparfb.c index de9756506e88cd00975d30d3714c2cd881c3e6e1..05775accf2e45a8d26485e31a3636a9e456c121d 100644 --- a/coreblas/compute/core_zparfb.c +++ b/coreblas/compute/core_zparfb.c @@ -12,13 +12,8 @@ * @brief Chameleon core_zparfb CPU kernel * * @version 1.2.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 - * @author Dulceneia Becker - * @author Mathieu Faverge - * @author Emmanuel Agullo - * @author Cedric Castagnede * @author Florent Pruvost + * @author Mathieu Faverge * @date 2022-02-22 * @precisions normal z -> c d s * @@ -134,19 +129,19 @@ ******************************************************************************* * * @retval CHAMELEON_SUCCESS successful exit - * @retval <0 if -i, the i-th argument had an illegal value - * + * @retval <0 if -i, the i-th argument had an illegal value + * @retval The recommended LWORK value, if LWORK == -1 on entry. */ /* This kernel is never traced so return type on previous line for convert2eztrace.pl script */ int -CORE_zparfb(cham_side_t side, cham_trans_t trans, - cham_dir_t direct, cham_store_t storev, - int M1, int N1, int M2, int N2, int K, int L, - CHAMELEON_Complex64_t *A1, int LDA1, - CHAMELEON_Complex64_t *A2, int LDA2, - const CHAMELEON_Complex64_t *V, int LDV, - const CHAMELEON_Complex64_t *T, int LDT, - CHAMELEON_Complex64_t *WORK, int LDWORK) +CORE_zparfb( cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, + int M1, int N1, int M2, int N2, int K, int L, + CHAMELEON_Complex64_t *A1, int LDA1, + CHAMELEON_Complex64_t *A2, int LDA2, + const CHAMELEON_Complex64_t *V, int LDV, + const CHAMELEON_Complex64_t *T, int LDT, + CHAMELEON_Complex64_t *WORK, int LDWORK ) { static CHAMELEON_Complex64_t zone = 1.0; static CHAMELEON_Complex64_t mzone = -1.0; @@ -194,13 +189,13 @@ CORE_zparfb(cham_side_t side, cham_trans_t trans, } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) { return CHAMELEON_SUCCESS; + } if (direct == ChamDirForward) { if (side == ChamLeft) { - /* * Column or Rowwise / Forward / Left * ---------------------------------- diff --git a/coreblas/compute/core_zttmlq.c b/coreblas/compute/core_zttmlq.c index dcd962c008cf2178401916a36bd3ef244d2b32bd..9e7f433fef780d2259b9f1327f85df07cfc3e6f9 100644 --- a/coreblas/compute/core_zttmlq.c +++ b/coreblas/compute/core_zttmlq.c @@ -252,9 +252,9 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans, mi1, ni1, mi2, ni2, kb, l, A1 + LDA1 * jc + ic, LDA1, A2, LDA2, - V + i, LDV, + V + i, LDV, T + LDT * i, LDT, - WORK, LDWORK); + WORK, LDWORK ); } return CHAMELEON_SUCCESS; } diff --git a/coreblas/compute/core_zttmqr.c b/coreblas/compute/core_zttmqr.c index 4770c22c069cc1f764db0a0b8c97a0b94d2f0456..5229c4725a60cb27f530741f49d81257e87ecb3f 100644 --- a/coreblas/compute/core_zttmqr.c +++ b/coreblas/compute/core_zttmqr.c @@ -6,6 +6,7 @@ * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. + * *** * * @brief Chameleon core_zttmqr CPU kernel @@ -206,11 +207,13 @@ int CORE_zttmqr(cham_side_t side, cham_trans_t trans, } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans != ChamNoTrans)) - || ((side == ChamRight) && (trans == ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) || + ((side == ChamRight) && (trans == ChamNoTrans)) ) + { i1 = 0; i3 = IB; } @@ -249,5 +252,3 @@ int CORE_zttmqr(cham_side_t side, cham_trans_t trans, } return CHAMELEON_SUCCESS; } - - diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 5fc04a48ee7e0fdab1bd665bbeb2efaf09dd84e3..abd6f7fd8b8cca8dea9fe83fdc1bd745d9859d30 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -21,7 +21,6 @@ #include "cudablas.h" /** - ***************************************************************************** * * @ingroup CUDA_CHAMELEON_Complex64_t * @@ -137,15 +136,16 @@ * @retval <0 if -i, the i-th argument had an illegal value * @retval The recommended LWORK value, if LWORK == -1 on entry. */ +/* This kernel is never traced so return type on previous line for convert2eztrace.pl script */ int CUDA_zparfb( cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) @@ -161,7 +161,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cuDoubleComplex *workW, *workC, *workV; cublasStatus_t rc; cudaStream_t stream; - int j, ldW, ldC, ldV; + int i, j, ldW, ldC, ldV; cham_trans_t transW, transA2; int wssize = 0; int wrsize = 0; @@ -213,6 +213,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return -20; } + /* Quick return */ if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) { return CHAMELEON_SUCCESS; } @@ -266,10 +267,11 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, M2 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); - for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j, - 0, - (L - j) * sizeof(cuDoubleComplex), + j = 0; + i = M2 - L + 1; + for(; (i < M2) && (j < K); i++, j++ ) { + cudaMemsetAsync( workV + j * ldV + i, 0, + (M2 - i) * sizeof(cuDoubleComplex), stream ); } } @@ -277,7 +279,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ldV = K; /* - * Backup V, and put 0 in the lower part + * Backup V, and put 0 in the upper part */ cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), V, LDV * sizeof(cuDoubleComplex), @@ -285,8 +287,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cudaMemcpyDeviceToDevice, stream ); for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + ldV * ( M2 - L + j ), - 0, + cudaMemsetAsync( workV + ldV * ( M2 - L + j ), 0, j * sizeof(cuDoubleComplex), stream ); } @@ -414,10 +415,11 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, N2 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); - for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + (j-1) * ldV + N2 - L + j, - 0, - (L - j) * sizeof(cuDoubleComplex), + j = 0; + i = N2 - L + 1; + for(; (i < N2) && (j < K); i++, j++ ) { + cudaMemsetAsync( workV + j * ldV + i, 0, + (N2 - i) * sizeof(cuDoubleComplex), stream ); } } @@ -433,8 +435,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, cudaMemcpyDeviceToDevice, stream ); for(j = 1; j < K; j++) { - cudaMemsetAsync( workV + ldV * ( N2 - L + j ), - 0, + cudaMemsetAsync( workV + ldV * ( N2 - L + j ), 0, j * sizeof(cuDoubleComplex), stream ); } @@ -524,6 +525,5 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return CHAMELEON_ERR_NOT_SUPPORTED; } - (void)L; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c index c46a589810172fe52e30cdafa1aaab50851a7e85..845fc38d38a398da9f3cb9aa27f983218e3159f7 100644 --- a/cudablas/compute/cuda_zttmlq.c +++ b/cudablas/compute/cuda_zttmlq.c @@ -55,12 +55,12 @@ CUDA_zttmlq( cham_side_t side, cham_trans_t trans, if (N1 < 0) { return -4; } - if ( (M2 < 0) || - ( (M2 != M1) && (side == ChamRight) ) ){ + if ((M2 < 0) || + ( (side == ChamRight) && (M1 != M2) ) ) { return -5; } - if ( (N2 < 0) || - ( (N2 != N1) && (side == ChamLeft) ) ){ + if ((N2 < 0) || + ( (side == ChamLeft) && (N1 != N2) ) ) { return -6; } if ((K < 0) || diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c index e18466a0ed3bbcf797affb5d4113a886ddb79b79..23238e79248ead40c80a3157fd69f46ba3f0617e 100644 --- a/cudablas/compute/cuda_zttmqr.c +++ b/cudablas/compute/cuda_zttmqr.c @@ -32,7 +32,7 @@ CUDA_zttmqr( cham_side_t side, cham_trans_t trans, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ) { - int i, i1, i3; + int i, i1, i3; int NQ; int kb, l; int ic = 0; @@ -131,10 +131,10 @@ CUDA_zttmqr( cham_side_t side, cham_trans_t trans, CUDA_zparfb( side, trans, ChamDirForward, ChamColumnwise, mi1, ni1, mi2, ni2, kb, l, - A1 + LDA1*jc+ic, LDA1, + A1 + LDA1 * jc + ic, LDA1, A2, LDA2, - V + LDV*i, LDV, - T + LDT*i, LDT, + V + LDV * i, LDV, + T + LDT * i, LDT, WORK, LWORK, handle ); } return CHAMELEON_SUCCESS;