Mentions légales du service

Skip to content
Snippets Groups Projects
Commit f343a4fa authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Add missing tpmlqt kernel for GPU, and fix the parfb kernel to handle the TT cases

parent 5ed9ade8
No related branches found
No related tags found
1 merge request!129Fix #56 - Issue in CUDA_zparfb kernels for TT case
......@@ -19,7 +19,7 @@
#
# @version 1.0.0
# @author Florent Pruvost
# @date 2015-09-16
# @date 2018-11-09
#
###
......@@ -38,11 +38,13 @@ set(ZSRC
cuda_zsymm.c
cuda_zsyr2k.c
cuda_zsyrk.c
cuda_ztpmlqt.c
cuda_ztpmqrt.c
cuda_ztrmm.c
cuda_ztrsm.c
cuda_ztsmlq.c
cuda_ztsmqr.c
cuda_zttmlq.c
cuda_zttmqr.c
cuda_zunmlqt.c
cuda_zunmqrt.c
......
......@@ -15,21 +15,21 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int
CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
......@@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
}
/* Quick return */
if ((M == 0) || (N == 0) || (K == 0))
if ((M == 0) || (N == 0) || (K == 0)) {
return CHAMELEON_SUCCESS;
}
// opposite of trans
if (trans == ChamNoTrans)
if (trans == ChamNoTrans) {
transT = ChamConjTrans;
else
}
else {
transT = ChamNoTrans;
}
// whether T is upper or lower triangular
if (direct == ChamDirForward)
if (direct == ChamDirForward) {
uplo = ChamUpper;
else
}
else {
uplo = ChamLower;
}
if (storev == ChamColumnwise) {
notransV = ChamNoTrans;
......@@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T^H = C^H V T^H
CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
N, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
......@@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T = C V T
CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
M, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
......
......@@ -13,7 +13,7 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
......@@ -120,40 +120,32 @@
* The leading dimension of the array T. LDT >= K.
*
* @param[in,out] WORK
* Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K
* otherwise.
* Workspace of dimension at least:
* - K * (M2 + N2).
* If L > 0, it is recommended to extend it to
* - K * (2 * M2 + N2 ) if side == ChamLeft.
* - K * (M2 + 2 * N2 ) if side == ChamRight.
*
* @param[in] LDWORK
* The leading dimension of the array WORK: LDWORK >= K, if side ==
* ChamLeft, LDWORK >= M1 otehrwise.
*
* @param[in,out] WORKC
* Optionnal additional workspace to replace the TRMM operation by a GEMM kernel.
* This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2
* otherwise.
*
* @param[in] LDWORKC
* The leading dimension of the array WORKC: LDWORKC >= M2, if side ==
* ChamLeft, LDWORK >= K otehrwise.
* @param[in] LWORK
* The dimension of the array WORK. If LWORK < 0, returns immediately
* the recommended workspace size.
*
*******************************************************************************
*
* @return
* \retval CHAMELEON_SUCCESS successful exit
* \retval <0 if -i, the i-th argument had an illegal value
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
* @retval The recommended LWORK value, if LWORK == -1 on entry.
*/
int
CUDA_zparfb(cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M1, int N1, int M2, int N2, int K, int L,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM )
CUDA_zparfb( cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M1, int N1, int M2, int N2, int K, int L,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM )
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
......@@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
double mzone = -1.0;
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
cuDoubleComplex *workW, *workC, *workV;
int ldW, ldC, ldV;
int j;
cham_trans_t transW;
cham_trans_t transA2;
int wssize = 0;
int wrsize = 0;
CUBLAS_GET_STREAM;
......@@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
if (K < 0) {
return -9;
}
if ( ((LDWORK < K ) && (side == ChamLeft )) ||
((LDWORK < M1) && (side == ChamRight)) ) {
if (direct == ChamDirForward) {
wssize = K * (M2 + N2);
wrsize = wssize;
if ( L > 0 ) {
wrsize += (side == ChamLeft) ? M2 * K : K * N2;
}
}
if ( LWORK < 0 ) {
return wrsize;
}
else if ( LWORK < wssize ) {
cudablas_error(20, "Illegal value of LWORK");
return -20;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) {
return CHAMELEON_SUCCESS;
}
if (direct == ChamDirForward) {
if (side == ChamLeft) {
/*
* Column or Rowwise / Forward / Left
* ----------------------------------
......@@ -222,76 +229,116 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
* ( A2 )
*/
/*
* Store in WORK (N1 == N2):
* - Workspace W for the copy of A1 + V' * A2 (K x N1)
* - Workspace C for the copy of V * T (M2 x K )
* - Workspace V for the copy of V (M2 x K )
*/
workW = WORK;
ldW = K;
workC = workW + K * N1;
ldC = M2;
if ( L == 0 ) {
workV = (cuDoubleComplex*)V;
ldV = LDV;
}
else {
if ( LWORK < wrsize ) {
workC = NULL;
workV = workW + K * N1;
}
else {
workV = workC + M2 * K;
}
ldV = M2;
/*
* Backup V, and put 0 in the lower part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
M2 * sizeof(cuDoubleComplex), K,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
0.,
(L - j) * sizeof(cuDoubleComplex),
stream );
}
}
/*
* W = A1 + V' * A2:
* W = A1
* W = W + V' * A2
*
*/
cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), N1,
cudaMemcpyDeviceToDevice, stream );
transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
K, N1, M2,
CUBLAS_SADDR(zone),
V /* K*M2 */, LDV,
A2 /* M2*N1 */, LDA2,
CUBLAS_SADDR(zone),
WORK /* K*N1 */, LDWORK);
if (WORKC == NULL) {
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
K, N1, M2,
CUBLAS_SADDR(zone), workV /* M2*K */, ldV,
A2 /* M2*N2 */, LDA2,
CUBLAS_SADDR(zone), workW /* K *N2 */, ldW );
if ( workC == NULL ) {
/* W = op(T) * W */
CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit,
K, N2,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
workW, ldW,
CUBLAS_STREAM_VALUE );
/* A1 = A1 - W = A1 - op(T) * W */
for(j = 0; j < N1; j++) {
cublasZaxpy(CUBLAS_HANDLE
K, CUBLAS_SADDR(mzone),
(WORK + LDWORK*j), 1,
(A1 + LDA1*j), 1);
cublasZaxpy( CUBLAS_HANDLE
K, CUBLAS_SADDR(mzone),
workW + ldW * j, 1,
A1 + LDA1 * j, 1 );
}
/* A2 = A2 - op(V) * W */
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), V /* M2*K */, LDV,
WORK /* K*N2 */, LDWORK,
CUBLAS_SADDR(zone), A2 /* m2*N2 */, LDA2);
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), V /* M2 * K */, LDV,
workW /* K * N2 */, ldW,
CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 );
} else {
/* Wc = V * op(T) */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(trans),
M2, K, K,
CUBLAS_SADDR(zone), V, LDV,
T, LDT,
CUBLAS_SADDR(zzero), WORKC, LDWORKC );
CUBLAS_SADDR(zone), workV, ldV,
T, LDT,
CUBLAS_SADDR(zzero), workC, ldC );
/* A1 = A1 - opt(T) * W */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
K, N1, K,
CUBLAS_SADDR(mzone), T, LDT,
WORK, LDWORK,
CUBLAS_SADDR(zone), A1, LDA1 );
CUBLAS_SADDR(mzone), T, LDT,
workW, ldW,
CUBLAS_SADDR(zone), A1, LDA1 );
/* A2 = A2 - Wc * W */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), WORKC, LDWORKC,
WORK, LDWORK,
CUBLAS_SADDR(mzone), workC, ldC,
workW, ldW,
CUBLAS_SADDR(zone), A2, LDA2 );
}
}
......@@ -304,14 +351,56 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
*
*/
/*
* Store in WORK (M1 == M2):
* - Workspace W for the copy of A1 + A2 * V' (M1 x K )
* - Workspace C for the copy of V * T (K x N2)
* - Workspace V for the copy of V (K x N2)
*/
workW = WORK;
ldW = M1;
workC = workW + M1 * K;
ldC = K;
if ( L == 0 ) {
workV = (cuDoubleComplex*)V;
ldV = LDV;
}
else {
if ( LWORK < wrsize ) {
workC = NULL;
workV = workW + M2 * K;
}
else {
workV = workC + K * N2;
}
ldV = K;
/*
* Backup V, and put 0 in the upper part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), N2,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + ldV + N2 - L + j,
0.,
j * sizeof(cuDoubleComplex),
stream );
}
}
/*
* W = A1 + A2 * V':
* W = A1
* W = W + A2 * V'
*
*/
cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
M1 * sizeof(cuDoubleComplex), K,
cudaMemcpyDeviceToDevice, stream );
......@@ -321,40 +410,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW),
M1, K, N2,
CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2,
V /* N2*K */, LDV,
CUBLAS_SADDR(zone), WORK /* M1*K */, LDWORK);
CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2,
workV /* K *N2 */, ldV,
CUBLAS_SADDR(zone), workW /* M1*K */, ldW);
if (WORKC == NULL) {
if ( workC == NULL ) {
/* W = W * op(T) */
CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit,
M2, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
workW, ldW,
CUBLAS_STREAM_VALUE );
/* A1 = A1 - W = A1 - W * op(T) */
for(j = 0; j < K; j++) {
cublasZaxpy(CUBLAS_HANDLE
M1, CUBLAS_SADDR(mzone),
(WORK + LDWORK*j), 1,
(A1 + LDA1*j), 1);
cublasZaxpy( CUBLAS_HANDLE
M1, CUBLAS_SADDR(mzone),
workW + ldW * j, 1,
A1 + LDA1 * j, 1 );
}
/* A2 = A2 - W * op(V) */
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
M2, N2, K,
CUBLAS_SADDR(mzone), WORK /* M2*K */, LDWORK,
V /* K*N2 */, LDV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
CUBLAS_SADDR(mzone), workW /* M2*K */, ldW,
V /* K *N2 */, LDV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
} else {
/* A1 = A1 - W * opt(T) */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans),
M1, K, K,
CUBLAS_SADDR(mzone), WORK, LDWORK,
CUBLAS_SADDR(mzone), workW, ldW,
T, LDT,
CUBLAS_SADDR(zone), A1, LDA1 );
......@@ -363,15 +452,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
chameleon_cublas_const(trans), chameleon_cublas_const(transA2),
K, N2, K,
CUBLAS_SADDR(zone), T, LDT,
V, LDV,
CUBLAS_SADDR(zzero), WORKC, LDWORKC );
workV, ldV,
CUBLAS_SADDR(zzero), workC, ldC );
/* A2 = A2 - W * Wc */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), WORK, LDWORK,
WORKC, LDWORKC,
CUBLAS_SADDR(mzone), workW, ldW,
workC, ldC,
CUBLAS_SADDR(zone), A2, LDA2 );
}
}
......
/**
*
* @file cuda_ztpmlqt.c
*
* @copyright 2009-2016 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_ztpmlqt GPU kernel
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
/**
*******************************************************************************
*
* @ingroup CORE_CHAMELEON_Complex64_t
*
* @brief Applies a complex orthogonal matrix Q.
*
* The matrix Q is obtained from a "triangular-pentagonal" complex block
* reflector H to a general complex matrix C, which consists of two blocks A and
* B.
*
*******************************************************************************
*
* @param[in] side
* @arg ChamLeft : apply Q or Q**H from the Left;
* @arg ChamRight : apply Q or Q**H from the Right.
*
* @param[in] trans
* @arg ChamNoTrans : No transpose, apply Q;
* @arg ChamConjTrans : ConjTranspose, apply Q**H.
*
* @param[in] M
* The number of rows of the tile B. M >= 0.
*
* @param[in] N
* The number of columns of the tile B. N >= 0.
*
* @param[in] K
* The number of elementary reflectors whose product defines
* the matrix Q.
*
* @param[in] L
* The number of rows of the upper trapezoidal part of V.
* K >= L >= 0. See Further Details.
*
* @param[in] IB
* The inner-blocking size. IB >= 0.
*
* @param[in] V
* The i-th row must contain the vector which defines the
* elementary reflector H(i), for i = 1,2,...,k, as returned by
* CORE_ZTPQRT in the first k rows of its array argument V.
*
* @param[in] LDV
* The leading dimension of the array V. LDV >= max(1,K).
*
* @param[in] T
* The IB-by-N1 triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
* @param[in] LDT
* The leading dimension of the array T. LDT >= IB.
*
* @param[in,out] A
* A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
* or (LDA,K) if SIDE = ChamRight
* On entry, the K-by-N or M-by-K matrix A.
* On exit, A is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDA
* The leading dimension of the array A. LDA >= max(1,M).
* If side = ChamLeft, LDA >= max(1,K);
* If side = Chamright, LDA >= max(1,M).
*
* @param[in,out] B
* On entry, the M-by-N tile B.
* On exit, B is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDB
* The leading dimension of the tile B. LDB >= max(1,M).
*
* @param[out] WORK
* Workspace array of size LDWORK-by-NB.
* LDWORK = N if side = ChamLeft, or M if side = ChamRight.
*
*******************************************************************************
*
* @par Further Details:
* =====================
*
* The columns of the pentagonal matrix V contain the elementary reflectors
* H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
* trapezoidal block V2:
*
* V = [V1] [V2].
*
* The size of the trapezoidal block V2 is determined by the parameter L,
* where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L
* rows of a K-by-K upper triangular matrix. If L=K, V2 is lower triangular;
* if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
*
* If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is K-by-M.
* [B]
*
* If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is K-by-N.
*
* The complex orthogonal matrix Q is formed from V and T.
*
* If trans='N' and side='L', C is on exit replaced with Q * C.
*
* If trans='C' and side='L', C is on exit replaced with Q**H * C.
*
* If trans='N' and side='R', C is on exit replaced with C * Q.
*
* If trans='C' and side='R', C is on exit replaced with C * Q**H.
*
*******************************************************************************
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
*
*/
int
CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int IB,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM )
{
int m1, n1;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
cudablas_error(1, "Illegal value of side");
return -1;
}
if ( side == ChamLeft ) {
m1 = K;
n1 = N;
}
else {
m1 = M;
n1 = K;
}
/* TS case */
if (L == 0) {
CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
/* TT case */
else if( L == N ) {
CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
else {
cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" );
return -6;
}
return CHAMELEON_SUCCESS;
}
......@@ -13,12 +13,128 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
/**
*******************************************************************************
*
* @ingroup CORE_CHAMELEON_Complex64_t
*
* @brief Applies a complex orthogonal matrix Q.
*
* The matrix Q is obtained from a "triangular-pentagonal" complex block
* reflector H to a general complex matrix C, which consists of two blocks A and
* B.
*
*******************************************************************************
*
* @param[in] side
* @arg ChamLeft : apply Q or Q**H from the Left;
* @arg ChamRight : apply Q or Q**H from the Right.
*
* @param[in] trans
* @arg ChamNoTrans : No transpose, apply Q;
* @arg ChamConjTrans : ConjTranspose, apply Q**H.
*
* @param[in] M
* The number of rows of the tile B. M >= 0.
*
* @param[in] N
* The number of columns of the tile B. N >= 0.
*
* @param[in] K
* The number of elementary reflectors whose product defines
* the matrix Q.
*
* @param[in] L
* The number of rows of the upper trapezoidal part of V.
* K >= L >= 0. See Further Details.
*
* @param[in] IB
* The inner-blocking size. IB >= 0.
*
* @param[in] V
* The i-th row must contain the vector which defines the
* elementary reflector H(i), for i = 1,2,...,k, as returned by
* CORE_ZTPQRT in the first k rows of its array argument V.
*
* @param[in] LDV
* The leading dimension of the array V. LDV >= max(1,K).
*
* @param[in] T
* The IB-by-N1 triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
* @param[in] LDT
* The leading dimension of the array T. LDT >= IB.
*
* @param[in,out] A
* A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
* or (LDA,K) if SIDE = ChamRight
* On entry, the K-by-N or M-by-K matrix A.
* On exit, A is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDA
* The leading dimension of the array A. LDA >= max(1,M).
* If side = ChamLeft, LDA >= max(1,K);
* If side = Chamright, LDA >= max(1,M).
*
* @param[in,out] B
* On entry, the M-by-N tile B.
* On exit, B is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDB
* The leading dimension of the tile B. LDB >= max(1,M).
*
* @param[out] WORK
* Workspace array of size LDWORK-by-NB.
* LDWORK = N if side = ChamLeft, or M if side = ChamRight.
*
*******************************************************************************
*
* @par Further Details:
* =====================
*
* The columns of the pentagonal matrix V contain the elementary reflectors
* H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
* trapezoidal block V2:
*
* V = [V1]
* [V2].
*
* The size of the trapezoidal block V2 is determined by the parameter L,
* where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L
* rows of a K-by-K upper triangular matrix. If L=K, V2 is upper triangular;
* if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
*
* If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is M-by-K.
* [B]
*
* If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is N-by-K.
*
* The complex orthogonal matrix Q is formed from V and T.
*
* If trans='N' and side='L', C is on exit replaced with Q * C.
*
* If trans='C' and side='L', C is on exit replaced with Q**H * C.
*
* If trans='N' and side='R', C is on exit replaced with C * Q.
*
* If trans='C' and side='R', C is on exit replaced with C * Q**H.
*
*******************************************************************************
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
*
*/
int
CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int IB,
......@@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK,
cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM )
{
int m1, n1, ldwork, ldworkc, ws;
int m1, n1;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
......@@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
if ( side == ChamLeft ) {
m1 = K;
n1 = N;
ldwork = IB;
ldworkc = M;
ws = ldwork * n1;
}
else {
m1 = M;
n1 = K;
ldwork = chameleon_max( K, chameleon_max( M, N ) );
ldworkc = IB;
ws = ldwork * IB;
}
/* TS case */
if (L == 0) {
CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, ldwork, WORK + ws, ldworkc,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
/* TT case */
else if( L == M ) {
CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, ldwork, WORK + ws, ldworkc,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
else {
......
......@@ -13,24 +13,24 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_ztsmlq(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM)
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NW;
......@@ -90,21 +90,20 @@ int CUDA_ztsmlq(
if (LDT < chameleon_max(1,IB)){
return -16;
}
if (LDWORK < chameleon_max(1,NW)){
return -18;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if (((side == ChamLeft) && (trans == ChamNoTrans))
|| ((side == ChamRight) && (trans != ChamNoTrans))) {
if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) ||
((side == ChamRight) && (trans != ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ((K-1) / IB)*IB;
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
......@@ -115,7 +114,7 @@ int CUDA_ztsmlq(
trans = ChamNoTrans;
}
for(i = i1; (i > -1) && (i < K); i += i3) {
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
......@@ -137,13 +136,13 @@ int CUDA_ztsmlq(
* Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
*/
CUDA_zparfb(
side, trans, ChamDirForward, ChamRowwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + i, LDV,
T + LDT*i, LDT,
WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
side, trans, ChamDirForward, ChamRowwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + i, LDV,
T + LDT*i, LDT,
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
......@@ -13,24 +13,24 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_ztsmqr(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM)
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NQ, NW;
......@@ -92,25 +92,24 @@ int CUDA_ztsmqr(
if (LDT < chameleon_max(1,IB)){
return -16;
}
if (LDWORK < chameleon_max(1,NW)){
return -18;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if (((side == ChamLeft) && (trans != ChamNoTrans))
|| ((side == ChamRight) && (trans == ChamNoTrans))) {
if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) ||
((side == ChamRight) && (trans == ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ((K-1) / IB)*IB;
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
for(i = i1; (i > -1) && (i < K); i += i3) {
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
......@@ -127,17 +126,18 @@ int CUDA_ztsmqr(
ni = N1 - i;
jc = i;
}
/*
* Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
*/
CUDA_zparfb(
side, trans, ChamDirForward, ChamColumnwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + LDV*i, LDV,
T + LDT*i, LDT,
WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
side, trans, ChamDirForward, ChamColumnwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + LDV*i, LDV,
T + LDT*i, LDT,
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
/**
*
* @file cuda_zttmlq.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_zttmlq GPU kernel
*
* @version 1.0.0
* @author Florent Pruvost
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_zttmlq(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NW;
int kb, l;
int ic = 0;
int jc = 0;
int mi1 = M1;
int mi2 = M2;
int ni1 = N1;
int ni2 = N2;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
return -1;
}
/* NQ is the order of Q */
if (side == ChamLeft) {
NW = IB;
}
else {
NW = N1;
}
if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
return -2;
}
if (M1 < 0) {
return -3;
}
if (N1 < 0) {
return -4;
}
if ( (M2 < 0) ||
( (M2 != M1) && (side == ChamRight) ) ){
return -5;
}
if ( (N2 < 0) ||
( (N2 != N1) && (side == ChamLeft) ) ){
return -6;
}
if ((K < 0) ||
( (side == ChamLeft) && (K > M1) ) ||
( (side == ChamRight) && (K > N1) ) ) {
return -7;
}
if (IB < 0) {
return -8;
}
if (LDA1 < chameleon_max(1,M1)){
return -10;
}
if (LDA2 < chameleon_max(1,M2)){
return -12;
}
if (LDV < chameleon_max(1,K)){
return -14;
}
if (LDT < chameleon_max(1,IB)){
return -16;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) ||
((side == ChamRight) && (trans != ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
/* Transpose */
if (trans == ChamNoTrans) {
trans = ChamConjTrans;
}
else {
trans = ChamNoTrans;
}
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
mi1 = kb;
mi2 = chameleon_min(i+kb, M2);
l = chameleon_min(kb, chameleon_max(0, M2-i));
ic = i;
}
else {
ni1 = kb;
ni2 = chameleon_min(i+kb, N2);
l = chameleon_min(kb, chameleon_max(0, N2-i));
jc = i;
}
/*
* Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb)
*/
CUDA_zparfb(
side, trans, ChamDirForward, ChamRowwise,
mi1, ni1, mi2, ni2, kb, l,
A1 + LDA1 * jc + ic, LDA1,
A2, LDA2,
V + i, LDV,
T + LDT * i, LDT,
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
......@@ -6,6 +6,7 @@
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_zttmqr GPU kernel
......@@ -13,7 +14,7 @@
* @version 1.0.0
* @author Florent Pruvost
* @author Mathieu Faverge
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
......@@ -28,13 +29,12 @@ int CUDA_zttmqr(
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3, l;
int i, i1, i3;
int NQ, NW;
int kb;
int kb, l;
int ic = 0;
int jc = 0;
int mi1 = M1;
......@@ -94,25 +94,24 @@ int CUDA_zttmqr(
if (LDT < chameleon_max(1,IB)){
return -16;
}
if (LDWORK < chameleon_max(1,NW)){
return -18;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if (((side == ChamLeft) && (trans != ChamNoTrans))
|| ((side == ChamRight) && (trans == ChamNoTrans))) {
if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) ||
((side == ChamRight) && (trans == ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ((K-1) / IB)*IB;
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
for(i = i1; (i > -1) && (i < K); i += i3) {
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
......@@ -138,8 +137,7 @@ int CUDA_zttmqr(
A2, LDA2,
V + LDV*i, LDV,
T + LDT*i, LDT,
WORK, LDWORK,
WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
......@@ -13,7 +13,7 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
......@@ -31,16 +31,18 @@ int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleCom
int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM );
int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
int CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM );
int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
int CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
int CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment