diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index d9859a604616b7b4c8e79f7bc8a32238b68eca40..4a1f8559a9c62bbad84f79c545c89075dd71229d 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -19,7 +19,7 @@ # # @version 1.0.0 # @author Florent Pruvost -# @date 2015-09-16 +# @date 2018-11-09 # ### @@ -38,11 +38,13 @@ set(ZSRC cuda_zsymm.c cuda_zsyr2k.c cuda_zsyrk.c + cuda_ztpmlqt.c cuda_ztpmqrt.c cuda_ztrmm.c cuda_ztrsm.c cuda_ztsmlq.c cuda_ztsmqr.c + cuda_zttmlq.c cuda_zttmqr.c cuda_zunmlqt.c cuda_zunmqrt.c diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index b44b22ca9225b87dae0bc7cacf0de17050a6a2ef..51fb0f3c70c33d84ea01d9499f51eeac053a0791 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -15,21 +15,21 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int -CUDA_zlarfb(cham_side_t side, cham_trans_t trans, - cham_dir_t direct, cham_store_t storev, - int M, int N, int K, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zlarfb( cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, + int M, int N, int K, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + CUBLAS_STREAM_PARAM ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, } /* Quick return */ - if ((M == 0) || (N == 0) || (K == 0)) + if ((M == 0) || (N == 0) || (K == 0)) { return CHAMELEON_SUCCESS; + } // opposite of trans - if (trans == ChamNoTrans) + if (trans == ChamNoTrans) { transT = ChamConjTrans; - else + } + else { transT = ChamNoTrans; + } // whether T is upper or lower triangular - if (direct == ChamDirForward) + if (direct == ChamDirForward) { uplo = ChamUpper; - else + } + else { uplo = ChamLower; + } if (storev == ChamColumnwise) { notransV = ChamNoTrans; @@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, // W = W T^H = C^H V T^H CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit, N, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + WORK, LDWORK, CUBLAS_STREAM_VALUE ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C @@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans, // W = W T = C V T CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit, M, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + WORK, LDWORK, CUBLAS_STREAM_VALUE ); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 292ac3b647cebbee27d15f7336ed80a21c8de9b0..f644fcaeb0c188c9dd3f9caf8e1d499657bb28b5 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -120,40 +120,32 @@ * The leading dimension of the array T. LDT >= K. * * @param[in,out] WORK - * Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K - * otherwise. + * Workspace of dimension at least: + * - K * (M2 + N2). + * If L > 0, it is recommended to extend it to + * - K * (2 * M2 + N2 ) if side == ChamLeft. + * - K * (M2 + 2 * N2 ) if side == ChamRight. * - * @param[in] LDWORK - * The leading dimension of the array WORK: LDWORK >= K, if side == - * ChamLeft, LDWORK >= M1 otehrwise. - * - * @param[in,out] WORKC - * Optionnal additional workspace to replace the TRMM operation by a GEMM kernel. - * This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2 - * otherwise. - * - * @param[in] LDWORKC - * The leading dimension of the array WORKC: LDWORKC >= M2, if side == - * ChamLeft, LDWORK >= K otehrwise. + * @param[in] LWORK + * The dimension of the array WORK. If LWORK < 0, returns immediately + * the recommended workspace size. * ******************************************************************************* * - * @return - * \retval CHAMELEON_SUCCESS successful exit - * \retval <0 if -i, the i-th argument had an illegal value - * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * @retval The recommended LWORK value, if LWORK == -1 on entry. */ int -CUDA_zparfb(cham_side_t side, cham_trans_t trans, - cham_dir_t direct, cham_store_t storev, - int M1, int N1, int M2, int N2, int K, int L, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM ) +CUDA_zparfb( cham_side_t side, cham_trans_t trans, + cham_dir_t direct, cham_store_t storev, + int M1, int N1, int M2, int N2, int K, int L, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, double mzone = -1.0; #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ + cuDoubleComplex *workW, *workC, *workV; + int ldW, ldC, ldV; int j; cham_trans_t transW; cham_trans_t transA2; + int wssize = 0; + int wrsize = 0; CUBLAS_GET_STREAM; @@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, if (K < 0) { return -9; } - if ( ((LDWORK < K ) && (side == ChamLeft )) || - ((LDWORK < M1) && (side == ChamRight)) ) { + + if (direct == ChamDirForward) { + wssize = K * (M2 + N2); + wrsize = wssize; + if ( L > 0 ) { + wrsize += (side == ChamLeft) ? M2 * K : K * N2; + } + } + + if ( LWORK < 0 ) { + return wrsize; + } + else if ( LWORK < wssize ) { + cudablas_error(20, "Illegal value of LWORK"); return -20; } - /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) { return CHAMELEON_SUCCESS; + } if (direct == ChamDirForward) { if (side == ChamLeft) { - /* * Column or Rowwise / Forward / Left * ---------------------------------- @@ -222,76 +229,116 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, * ( A2 ) */ + /* + * Store in WORK (N1 == N2): + * - Workspace W for the copy of A1 + V' * A2 (K x N1) + * - Workspace C for the copy of V * T (M2 x K ) + * - Workspace V for the copy of V (M2 x K ) + */ + workW = WORK; + ldW = K; + + workC = workW + K * N1; + ldC = M2; + + if ( L == 0 ) { + workV = (cuDoubleComplex*)V; + ldV = LDV; + } + else { + if ( LWORK < wrsize ) { + workC = NULL; + workV = workW + K * N1; + } + else { + workV = workC + M2 * K; + } + ldV = M2; + + /* + * Backup V, and put 0 in the lower part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + M2 * sizeof(cuDoubleComplex), K, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j, + 0., + (L - j) * sizeof(cuDoubleComplex), + stream ); + } + } + /* * W = A1 + V' * A2: * W = A1 * W = W + V' * A2 * */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), + cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), K * sizeof(cuDoubleComplex), N1, cudaMemcpyDeviceToDevice, stream ); transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), - K, N1, M2, - CUBLAS_SADDR(zone), - V /* K*M2 */, LDV, - A2 /* M2*N1 */, LDA2, - CUBLAS_SADDR(zone), - WORK /* K*N1 */, LDWORK); - - if (WORKC == NULL) { + cublasZgemm( CUBLAS_HANDLE + chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), + K, N1, M2, + CUBLAS_SADDR(zone), workV /* M2*K */, ldV, + A2 /* M2*N2 */, LDA2, + CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + + if ( workC == NULL ) { /* W = op(T) * W */ CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit, K, N2, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + workW, ldW, CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { - cublasZaxpy(CUBLAS_HANDLE - K, CUBLAS_SADDR(mzone), - (WORK + LDWORK*j), 1, - (A1 + LDA1*j), 1); + cublasZaxpy( CUBLAS_HANDLE + K, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); } /* A2 = A2 - op(V) * W */ - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), V /* M2*K */, LDV, - WORK /* K*N2 */, LDWORK, - CUBLAS_SADDR(zone), A2 /* m2*N2 */, LDA2); + cublasZgemm( CUBLAS_HANDLE + chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), V /* M2 * K */, LDV, + workW /* K * N2 */, ldW, + CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); } else { /* Wc = V * op(T) */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(transA2), chameleon_cublas_const(trans), M2, K, K, - CUBLAS_SADDR(zone), V, LDV, - T, LDT, - CUBLAS_SADDR(zzero), WORKC, LDWORKC ); + CUBLAS_SADDR(zone), workV, ldV, + T, LDT, + CUBLAS_SADDR(zzero), workC, ldC ); /* A1 = A1 - opt(T) * W */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), K, N1, K, - CUBLAS_SADDR(mzone), T, LDT, - WORK, LDWORK, - CUBLAS_SADDR(zone), A1, LDA1 ); + CUBLAS_SADDR(mzone), T, LDT, + workW, ldW, + CUBLAS_SADDR(zone), A1, LDA1 ); /* A2 = A2 - Wc * W */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), M2, N2, K, - CUBLAS_SADDR(mzone), WORKC, LDWORKC, - WORK, LDWORK, + CUBLAS_SADDR(mzone), workC, ldC, + workW, ldW, CUBLAS_SADDR(zone), A2, LDA2 ); } } @@ -304,14 +351,56 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, * */ + /* + * Store in WORK (M1 == M2): + * - Workspace W for the copy of A1 + A2 * V' (M1 x K ) + * - Workspace C for the copy of V * T (K x N2) + * - Workspace V for the copy of V (K x N2) + */ + workW = WORK; + ldW = M1; + + workC = workW + M1 * K; + ldC = K; + + if ( L == 0 ) { + workV = (cuDoubleComplex*)V; + ldV = LDV; + } + else { + if ( LWORK < wrsize ) { + workC = NULL; + workV = workW + M2 * K; + } + else { + workV = workC + K * N2; + } + ldV = K; + + /* + * Backup V, and put 0 in the upper part + */ + cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex), + V, LDV * sizeof(cuDoubleComplex), + K * sizeof(cuDoubleComplex), N2, + cudaMemcpyDeviceToDevice, stream ); + + for(j = 1; j < K; j++) { + cudaMemsetAsync( workV + ldV + N2 - L + j, + 0., + j * sizeof(cuDoubleComplex), + stream ); + } + } + /* * W = A1 + A2 * V': * W = A1 * W = W + A2 * V' * */ - cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex), - A1, LDA1 * sizeof(cuDoubleComplex), + cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex), + A1, LDA1 * sizeof(cuDoubleComplex), M1 * sizeof(cuDoubleComplex), K, cudaMemcpyDeviceToDevice, stream ); @@ -321,40 +410,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, cublasZgemm(CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), M1, K, N2, - CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, - V /* N2*K */, LDV, - CUBLAS_SADDR(zone), WORK /* M1*K */, LDWORK); + CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), workW /* M1*K */, ldW); - if (WORKC == NULL) { + if ( workC == NULL ) { /* W = W * op(T) */ CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit, M2, K, - CUBLAS_SADDR(zone), T, LDT, - WORK, LDWORK, + &zone, T, LDT, + workW, ldW, CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - W * op(T) */ for(j = 0; j < K; j++) { - cublasZaxpy(CUBLAS_HANDLE - M1, CUBLAS_SADDR(mzone), - (WORK + LDWORK*j), 1, - (A1 + LDA1*j), 1); + cublasZaxpy( CUBLAS_HANDLE + M1, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); } /* A2 = A2 - W * op(V) */ cublasZgemm(CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), M2, N2, K, - CUBLAS_SADDR(mzone), WORK /* M2*K */, LDWORK, - V /* K*N2 */, LDV, - CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, + V /* K *N2 */, LDV, + CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); } else { /* A1 = A1 - W * opt(T) */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), M1, K, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, + CUBLAS_SADDR(mzone), workW, ldW, T, LDT, CUBLAS_SADDR(zone), A1, LDA1 ); @@ -363,15 +452,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans, chameleon_cublas_const(trans), chameleon_cublas_const(transA2), K, N2, K, CUBLAS_SADDR(zone), T, LDT, - V, LDV, - CUBLAS_SADDR(zzero), WORKC, LDWORKC ); + workV, ldV, + CUBLAS_SADDR(zzero), workC, ldC ); /* A2 = A2 - W * Wc */ cublasZgemm( CUBLAS_HANDLE chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), M2, N2, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, - WORKC, LDWORKC, + CUBLAS_SADDR(mzone), workW, ldW, + workC, ldC, CUBLAS_SADDR(zone), A2, LDA2 ); } } diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c new file mode 100644 index 0000000000000000000000000000000000000000..4f01e0e28cf34690cc3e3daa1823c418e9557fac --- /dev/null +++ b/cudablas/compute/cuda_ztpmlqt.c @@ -0,0 +1,184 @@ +/** + * + * @file cuda_ztpmlqt.c + * + * @copyright 2009-2016 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_ztpmlqt GPU kernel + * + * @version 1.0.0 + * @author Mathieu Faverge + * @date 2018-11-09 + * @precisions normal z -> c d s + * + */ +#include "cudablas.h" + +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Applies a complex orthogonal matrix Q. + * + * The matrix Q is obtained from a "triangular-pentagonal" complex block + * reflector H to a general complex matrix C, which consists of two blocks A and + * B. + * + ******************************************************************************* + * + * @param[in] side + * @arg ChamLeft : apply Q or Q**H from the Left; + * @arg ChamRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg ChamNoTrans : No transpose, apply Q; + * @arg ChamConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] M + * The number of rows of the tile B. M >= 0. + * + * @param[in] N + * The number of columns of the tile B. N >= 0. + * + * @param[in] K + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] L + * The number of rows of the upper trapezoidal part of V. + * K >= L >= 0. See Further Details. + * + * @param[in] IB + * The inner-blocking size. IB >= 0. + * + * @param[in] V + * The i-th row must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTPQRT in the first k rows of its array argument V. + * + * @param[in] LDV + * The leading dimension of the array V. LDV >= max(1,K). + * + * @param[in] T + * The IB-by-N1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] LDT + * The leading dimension of the array T. LDT >= IB. + * + * @param[in,out] A + * A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft + * or (LDA,K) if SIDE = ChamRight + * On entry, the K-by-N or M-by-K matrix A. + * On exit, A is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * If side = ChamLeft, LDA >= max(1,K); + * If side = Chamright, LDA >= max(1,M). + * + * @param[in,out] B + * On entry, the M-by-N tile B. + * On exit, B is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDB + * The leading dimension of the tile B. LDB >= max(1,M). + * + * @param[out] WORK + * Workspace array of size LDWORK-by-NB. + * LDWORK = N if side = ChamLeft, or M if side = ChamRight. + * + ******************************************************************************* + * + * @par Further Details: + * ===================== + * + * The columns of the pentagonal matrix V contain the elementary reflectors + * H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a + * trapezoidal block V2: + * + * V = [V1] [V2]. + * + * The size of the trapezoidal block V2 is determined by the parameter L, + * where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L + * rows of a K-by-K upper triangular matrix. If L=K, V2 is lower triangular; + * if L=0, there is no trapezoidal block, hence V = V1 is rectangular. + * + * If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is K-by-M. + * [B] + * + * If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is K-by-N. + * + * The complex orthogonal matrix Q is formed from V and T. + * + * If trans='N' and side='L', C is on exit replaced with Q * C. + * + * If trans='C' and side='L', C is on exit replaced with Q**H * C. + * + * If trans='N' and side='R', C is on exit replaced with C * Q. + * + * If trans='C' and side='R', C is on exit replaced with C * Q**H. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ +int +CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int L, int IB, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *A, int LDA, + cuDoubleComplex *B, int LDB, + cuDoubleComplex *WORK, int lwork, + CUBLAS_STREAM_PARAM ) +{ + int m1, n1; + + /* Check input arguments */ + if ((side != ChamLeft) && (side != ChamRight)) { + cudablas_error(1, "Illegal value of side"); + return -1; + } + + if ( side == ChamLeft ) { + m1 = K; + n1 = N; + } + else { + m1 = M; + n1 = K; + } + + /* TS case */ + if (L == 0) { + CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB, + A, LDA, B, LDB, V, LDV, T, LDT, + WORK, lwork, + CUBLAS_STREAM_VALUE ); + } + /* TT case */ + else if( L == N ) { + CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB, + A, LDA, B, LDB, V, LDV, T, LDT, + WORK, lwork, + CUBLAS_STREAM_VALUE ); + } + else { + cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" ); + return -6; + } + + return CHAMELEON_SUCCESS; +} diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c index 2719edd32b1fef478dd150f67b89ba0c8dc465e2..c7a19fd76726491b982c0da9be7c0ff444fd829c 100644 --- a/cudablas/compute/cuda_ztpmqrt.c +++ b/cudablas/compute/cuda_ztpmqrt.c @@ -13,12 +13,128 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" +/** + ******************************************************************************* + * + * @ingroup CORE_CHAMELEON_Complex64_t + * + * @brief Applies a complex orthogonal matrix Q. + * + * The matrix Q is obtained from a "triangular-pentagonal" complex block + * reflector H to a general complex matrix C, which consists of two blocks A and + * B. + * + ******************************************************************************* + * + * @param[in] side + * @arg ChamLeft : apply Q or Q**H from the Left; + * @arg ChamRight : apply Q or Q**H from the Right. + * + * @param[in] trans + * @arg ChamNoTrans : No transpose, apply Q; + * @arg ChamConjTrans : ConjTranspose, apply Q**H. + * + * @param[in] M + * The number of rows of the tile B. M >= 0. + * + * @param[in] N + * The number of columns of the tile B. N >= 0. + * + * @param[in] K + * The number of elementary reflectors whose product defines + * the matrix Q. + * + * @param[in] L + * The number of rows of the upper trapezoidal part of V. + * K >= L >= 0. See Further Details. + * + * @param[in] IB + * The inner-blocking size. IB >= 0. + * + * @param[in] V + * The i-th row must contain the vector which defines the + * elementary reflector H(i), for i = 1,2,...,k, as returned by + * CORE_ZTPQRT in the first k rows of its array argument V. + * + * @param[in] LDV + * The leading dimension of the array V. LDV >= max(1,K). + * + * @param[in] T + * The IB-by-N1 triangular factor T of the block reflector. + * T is upper triangular by block (economic storage); + * The rest of the array is not referenced. + * + * @param[in] LDT + * The leading dimension of the array T. LDT >= IB. + * + * @param[in,out] A + * A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft + * or (LDA,K) if SIDE = ChamRight + * On entry, the K-by-N or M-by-K matrix A. + * On exit, A is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDA + * The leading dimension of the array A. LDA >= max(1,M). + * If side = ChamLeft, LDA >= max(1,K); + * If side = Chamright, LDA >= max(1,M). + * + * @param[in,out] B + * On entry, the M-by-N tile B. + * On exit, B is overwritten by the corresponding block of + * Q*C or Q**H*C or C*Q or C*Q**H. See Further Details. + * + * @param[in] LDB + * The leading dimension of the tile B. LDB >= max(1,M). + * + * @param[out] WORK + * Workspace array of size LDWORK-by-NB. + * LDWORK = N if side = ChamLeft, or M if side = ChamRight. + * + ******************************************************************************* + * + * @par Further Details: + * ===================== + * + * The columns of the pentagonal matrix V contain the elementary reflectors + * H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a + * trapezoidal block V2: + * + * V = [V1] + * [V2]. + * + * The size of the trapezoidal block V2 is determined by the parameter L, + * where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L + * rows of a K-by-K upper triangular matrix. If L=K, V2 is upper triangular; + * if L=0, there is no trapezoidal block, hence V = V1 is rectangular. + * + * If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is M-by-K. + * [B] + * + * If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is N-by-K. + * + * The complex orthogonal matrix Q is formed from V and T. + * + * If trans='N' and side='L', C is on exit replaced with Q * C. + * + * If trans='C' and side='L', C is on exit replaced with Q**H * C. + * + * If trans='N' and side='R', C is on exit replaced with C * Q. + * + * If trans='C' and side='R', C is on exit replaced with C * Q**H. + * + ******************************************************************************* + * + * @retval CHAMELEON_SUCCESS successful exit + * @retval <0 if -i, the i-th argument had an illegal value + * + */ int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, @@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, - cuDoubleComplex *WORK, + cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ) { - int m1, n1, ldwork, ldworkc, ws; + int m1, n1; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, if ( side == ChamLeft ) { m1 = K; n1 = N; - ldwork = IB; - ldworkc = M; - ws = ldwork * n1; } else { m1 = M; n1 = K; - ldwork = chameleon_max( K, chameleon_max( M, N ) ); - ldworkc = IB; - ws = ldwork * IB; } /* TS case */ if (L == 0) { CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, - WORK, ldwork, WORK + ws, ldworkc, + WORK, lwork, CUBLAS_STREAM_VALUE ); } /* TT case */ else if( L == M ) { CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, - WORK, ldwork, WORK + ws, ldworkc, + WORK, lwork, CUBLAS_STREAM_VALUE ); } else { diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c index 7d2e3c4a3fbc19d5ec319991a9d56a537608016b..fa949538f7782eb169a3ceb5fe237f4dc046fe05 100644 --- a/cudablas/compute/cuda_ztsmlq.c +++ b/cudablas/compute/cuda_ztsmlq.c @@ -13,24 +13,24 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @author Mathieu Faverge + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int CUDA_ztsmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM) + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) { int i, i1, i3; int NW; @@ -90,21 +90,20 @@ int CUDA_ztsmlq( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans == ChamNoTrans)) - || ((side == ChamRight) && (trans != ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || + ((side == ChamRight) && (trans != ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } @@ -115,7 +114,7 @@ int CUDA_ztsmlq( trans = ChamNoTrans; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -137,13 +136,13 @@ int CUDA_ztsmlq( * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) */ CUDA_zparfb( - side, trans, ChamDirForward, ChamRowwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + side, trans, ChamDirForward, ChamRowwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + i, LDV, + T + LDT*i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c index 4b07b9b9ce16397eabf8e358d81b7499eeab2c39..6a655a5401d1f1f7830afcbd19d4e2579fb48fa6 100644 --- a/cudablas/compute/cuda_ztsmqr.c +++ b/cudablas/compute/cuda_ztsmqr.c @@ -13,24 +13,24 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @author Mathieu Faverge + * @date 2018-11-09 * @precisions normal z -> c d s * */ #include "cudablas.h" int CUDA_ztsmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, - CUBLAS_STREAM_PARAM) + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) { int i, i1, i3; int NQ, NW; @@ -92,25 +92,24 @@ int CUDA_ztsmqr( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans != ChamNoTrans)) - || ((side == ChamRight) && (trans == ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) || + ((side == ChamRight) && (trans == ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -127,17 +126,18 @@ int CUDA_ztsmqr( ni = N1 - i; jc = i; } + /* * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb) */ CUDA_zparfb( - side, trans, ChamDirForward, ChamColumnwise, - mi, ni, M2, N2, kb, 0, - A1 + LDA1*jc+ic, LDA1, - A2, LDA2, - V + LDV*i, LDV, - T + LDT*i, LDT, - WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + side, trans, ChamDirForward, ChamColumnwise, + mi, ni, M2, N2, kb, 0, + A1 + LDA1*jc+ic, LDA1, + A2, LDA2, + V + LDV*i, LDV, + T + LDT*i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c new file mode 100644 index 0000000000000000000000000000000000000000..ddde1ed384df7bcb5ce7360de93bdfc3aaa926e2 --- /dev/null +++ b/cudablas/compute/cuda_zttmlq.c @@ -0,0 +1,149 @@ +/** + * + * @file cuda_zttmlq.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_zttmlq GPU kernel + * + * @version 1.0.0 + * @author Florent Pruvost + * @author Mathieu Faverge + * @date 2018-11-09 + * @precisions normal z -> c d s + * + */ +#include "cudablas.h" + +int CUDA_zttmlq( + cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + CUBLAS_STREAM_PARAM) +{ + int i, i1, i3; + int NW; + int kb, l; + int ic = 0; + int jc = 0; + int mi1 = M1; + int mi2 = M2; + int ni1 = N1; + int ni2 = N2; + + /* Check input arguments */ + if ((side != ChamLeft) && (side != ChamRight)) { + return -1; + } + + /* NQ is the order of Q */ + if (side == ChamLeft) { + NW = IB; + } + else { + NW = N1; + } + + if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) { + return -2; + } + if (M1 < 0) { + return -3; + } + if (N1 < 0) { + return -4; + } + if ( (M2 < 0) || + ( (M2 != M1) && (side == ChamRight) ) ){ + return -5; + } + if ( (N2 < 0) || + ( (N2 != N1) && (side == ChamLeft) ) ){ + return -6; + } + if ((K < 0) || + ( (side == ChamLeft) && (K > M1) ) || + ( (side == ChamRight) && (K > N1) ) ) { + return -7; + } + if (IB < 0) { + return -8; + } + if (LDA1 < chameleon_max(1,M1)){ + return -10; + } + if (LDA2 < chameleon_max(1,M2)){ + return -12; + } + if (LDV < chameleon_max(1,K)){ + return -14; + } + if (LDT < chameleon_max(1,IB)){ + return -16; + } + + /* Quick return */ + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { + return CHAMELEON_SUCCESS; + } + + if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || + ((side == ChamRight) && (trans != ChamNoTrans)) ) + { + i1 = 0; + i3 = IB; + } + else { + i1 = ( ( K-1 ) / IB )*IB; + i3 = -IB; + } + + /* Transpose */ + if (trans == ChamNoTrans) { + trans = ChamConjTrans; + } + else { + trans = ChamNoTrans; + } + + for (i = i1; (i > -1) && (i < K); i+=i3) { + kb = chameleon_min(IB, K-i); + + if (side == ChamLeft) { + mi1 = kb; + mi2 = chameleon_min(i+kb, M2); + l = chameleon_min(kb, chameleon_max(0, M2-i)); + ic = i; + } + else { + ni1 = kb; + ni2 = chameleon_min(i+kb, N2); + l = chameleon_min(kb, chameleon_max(0, N2-i)); + jc = i; + } + + /* + * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb) + */ + CUDA_zparfb( + side, trans, ChamDirForward, ChamRowwise, + mi1, ni1, mi2, ni2, kb, l, + A1 + LDA1 * jc + ic, LDA1, + A2, LDA2, + V + i, LDV, + T + LDT * i, LDT, + WORK, LWORK, CUBLAS_STREAM_VALUE ); + } + return CHAMELEON_SUCCESS; +} diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c index 236405cdf080fd3c7941aa62426a70cde2e5d8f7..e2a91ee817915b5453d79bfc987e0345ecab0022 100644 --- a/cudablas/compute/cuda_zttmqr.c +++ b/cudablas/compute/cuda_zttmqr.c @@ -6,6 +6,7 @@ * Tennessee Research Foundation. All rights reserved. * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, * Univ. Bordeaux. All rights reserved. + * *** * * @brief Chameleon cuda_zttmqr GPU kernel @@ -13,7 +14,7 @@ * @version 1.0.0 * @author Florent Pruvost * @author Mathieu Faverge - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -28,13 +29,12 @@ int CUDA_zttmqr( cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LDWORK, - cuDoubleComplex *WORKC, int LDWORKC, + cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM) { - int i, i1, i3, l; + int i, i1, i3; int NQ, NW; - int kb; + int kb, l; int ic = 0; int jc = 0; int mi1 = M1; @@ -94,25 +94,24 @@ int CUDA_zttmqr( if (LDT < chameleon_max(1,IB)){ return -16; } - if (LDWORK < chameleon_max(1,NW)){ - return -18; - } /* Quick return */ - if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) + if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) { return CHAMELEON_SUCCESS; + } - if (((side == ChamLeft) && (trans != ChamNoTrans)) - || ((side == ChamRight) && (trans == ChamNoTrans))) { + if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) || + ((side == ChamRight) && (trans == ChamNoTrans)) ) + { i1 = 0; i3 = IB; } else { - i1 = ((K-1) / IB)*IB; + i1 = ( ( K-1 ) / IB )*IB; i3 = -IB; } - for(i = i1; (i > -1) && (i < K); i += i3) { + for (i = i1; (i > -1) && (i < K); i+=i3) { kb = chameleon_min(IB, K-i); if (side == ChamLeft) { @@ -138,8 +137,7 @@ int CUDA_zttmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LDWORK, - WORKC, LDWORKC, CUBLAS_STREAM_VALUE ); + WORK, LWORK, CUBLAS_STREAM_VALUE ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h index 8e96d463ca451c74b17f91c22c2b6c03a27a101e..8895ff6485b33c423bd80b5b99599883eec58ce7 100644 --- a/cudablas/include/cudablas/cudablas_z.h +++ b/cudablas/include/cudablas/cudablas_z.h @@ -13,7 +13,7 @@ * * @version 1.0.0 * @author Florent Pruvost - * @date 2015-09-16 + * @date 2018-11-09 * @precisions normal z -> c d s * */ @@ -31,16 +31,18 @@ int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleCom int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); int CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM ); +int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); +int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); int CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); int CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); -int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );