Commit f343a4fa authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Add missing tpmlqt kernel for GPU, and fix the parfb kernel to handle the TT cases

parent 5ed9ade8
......@@ -19,7 +19,7 @@
#
# @version 1.0.0
# @author Florent Pruvost
# @date 2015-09-16
# @date 2018-11-09
#
###
......@@ -38,11 +38,13 @@ set(ZSRC
cuda_zsymm.c
cuda_zsyr2k.c
cuda_zsyrk.c
cuda_ztpmlqt.c
cuda_ztpmqrt.c
cuda_ztrmm.c
cuda_ztrsm.c
cuda_ztsmlq.c
cuda_ztsmqr.c
cuda_zttmlq.c
cuda_zttmqr.c
cuda_zunmlqt.c
cuda_zunmqrt.c
......
......@@ -15,21 +15,21 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int
CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
......@@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
}
/* Quick return */
if ((M == 0) || (N == 0) || (K == 0))
if ((M == 0) || (N == 0) || (K == 0)) {
return CHAMELEON_SUCCESS;
}
// opposite of trans
if (trans == ChamNoTrans)
if (trans == ChamNoTrans) {
transT = ChamConjTrans;
else
}
else {
transT = ChamNoTrans;
}
// whether T is upper or lower triangular
if (direct == ChamDirForward)
if (direct == ChamDirForward) {
uplo = ChamUpper;
else
}
else {
uplo = ChamLower;
}
if (storev == ChamColumnwise) {
notransV = ChamNoTrans;
......@@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T^H = C^H V T^H
CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
N, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
......@@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T = C V T
CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
M, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
......
......@@ -13,7 +13,7 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
......@@ -120,40 +120,32 @@
* The leading dimension of the array T. LDT >= K.
*
* @param[in,out] WORK
* Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K
* otherwise.
* Workspace of dimension at least:
* - K * (M2 + N2).
* If L > 0, it is recommended to extend it to
* - K * (2 * M2 + N2 ) if side == ChamLeft.
* - K * (M2 + 2 * N2 ) if side == ChamRight.
*
* @param[in] LDWORK
* The leading dimension of the array WORK: LDWORK >= K, if side ==
* ChamLeft, LDWORK >= M1 otehrwise.
*
* @param[in,out] WORKC
* Optionnal additional workspace to replace the TRMM operation by a GEMM kernel.
* This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2
* otherwise.
*
* @param[in] LDWORKC
* The leading dimension of the array WORKC: LDWORKC >= M2, if side ==
* ChamLeft, LDWORK >= K otehrwise.
* @param[in] LWORK
* The dimension of the array WORK. If LWORK < 0, returns immediately
* the recommended workspace size.
*
*******************************************************************************
*
* @return
* \retval CHAMELEON_SUCCESS successful exit
* \retval <0 if -i, the i-th argument had an illegal value
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
* @retval The recommended LWORK value, if LWORK == -1 on entry.
*/
int
CUDA_zparfb(cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M1, int N1, int M2, int N2, int K, int L,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM )
CUDA_zparfb( cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M1, int N1, int M2, int N2, int K, int L,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM )
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
......@@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
double mzone = -1.0;
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
cuDoubleComplex *workW, *workC, *workV;
int ldW, ldC, ldV;
int j;
cham_trans_t transW;
cham_trans_t transA2;
int wssize = 0;
int wrsize = 0;
CUBLAS_GET_STREAM;
......@@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
if (K < 0) {
return -9;
}
if ( ((LDWORK < K ) && (side == ChamLeft )) ||
((LDWORK < M1) && (side == ChamRight)) ) {
if (direct == ChamDirForward) {
wssize = K * (M2 + N2);
wrsize = wssize;
if ( L > 0 ) {
wrsize += (side == ChamLeft) ? M2 * K : K * N2;
}
}
if ( LWORK < 0 ) {
return wrsize;
}
else if ( LWORK < wssize ) {
cudablas_error(20, "Illegal value of LWORK");
return -20;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) {
return CHAMELEON_SUCCESS;
}
if (direct == ChamDirForward) {
if (side == ChamLeft) {
/*
* Column or Rowwise / Forward / Left
* ----------------------------------
......@@ -222,76 +229,116 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
* ( A2 )
*/
/*
* Store in WORK (N1 == N2):
* - Workspace W for the copy of A1 + V' * A2 (K x N1)
* - Workspace C for the copy of V * T (M2 x K )
* - Workspace V for the copy of V (M2 x K )
*/
workW = WORK;
ldW = K;
workC = workW + K * N1;
ldC = M2;
if ( L == 0 ) {
workV = (cuDoubleComplex*)V;
ldV = LDV;
}
else {
if ( LWORK < wrsize ) {
workC = NULL;
workV = workW + K * N1;
}
else {
workV = workC + M2 * K;
}
ldV = M2;
/*
* Backup V, and put 0 in the lower part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
M2 * sizeof(cuDoubleComplex), K,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
0.,
(L - j) * sizeof(cuDoubleComplex),
stream );
}
}
/*
* W = A1 + V' * A2:
* W = A1
* W = W + V' * A2
*
*/
cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), N1,
cudaMemcpyDeviceToDevice, stream );
transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
K, N1, M2,
CUBLAS_SADDR(zone),
V /* K*M2 */, LDV,
A2 /* M2*N1 */, LDA2,
CUBLAS_SADDR(zone),
WORK /* K*N1 */, LDWORK);
if (WORKC == NULL) {
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
K, N1, M2,
CUBLAS_SADDR(zone), workV /* M2*K */, ldV,
A2 /* M2*N2 */, LDA2,
CUBLAS_SADDR(zone), workW /* K *N2 */, ldW );
if ( workC == NULL ) {
/* W = op(T) * W */
CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit,
K, N2,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
workW, ldW,
CUBLAS_STREAM_VALUE );
/* A1 = A1 - W = A1 - op(T) * W */
for(j = 0; j < N1; j++) {
cublasZaxpy(CUBLAS_HANDLE
K, CUBLAS_SADDR(mzone),
(WORK + LDWORK*j), 1,
(A1 + LDA1*j), 1);
cublasZaxpy( CUBLAS_HANDLE
K, CUBLAS_SADDR(mzone),
workW + ldW * j, 1,
A1 + LDA1 * j, 1 );
}
/* A2 = A2 - op(V) * W */
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), V /* M2*K */, LDV,
WORK /* K*N2 */, LDWORK,
CUBLAS_SADDR(zone), A2 /* m2*N2 */, LDA2);
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), V /* M2 * K */, LDV,
workW /* K * N2 */, ldW,
CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 );
} else {
/* Wc = V * op(T) */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(transA2), chameleon_cublas_const(trans),
M2, K, K,
CUBLAS_SADDR(zone), V, LDV,
T, LDT,
CUBLAS_SADDR(zzero), WORKC, LDWORKC );
CUBLAS_SADDR(zone), workV, ldV,
T, LDT,
CUBLAS_SADDR(zzero), workC, ldC );
/* A1 = A1 - opt(T) * W */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
K, N1, K,
CUBLAS_SADDR(mzone), T, LDT,
WORK, LDWORK,
CUBLAS_SADDR(zone), A1, LDA1 );
CUBLAS_SADDR(mzone), T, LDT,
workW, ldW,
CUBLAS_SADDR(zone), A1, LDA1 );
/* A2 = A2 - Wc * W */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), WORKC, LDWORKC,
WORK, LDWORK,
CUBLAS_SADDR(mzone), workC, ldC,
workW, ldW,
CUBLAS_SADDR(zone), A2, LDA2 );
}
}
......@@ -304,14 +351,56 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
*
*/
/*
* Store in WORK (M1 == M2):
* - Workspace W for the copy of A1 + A2 * V' (M1 x K )
* - Workspace C for the copy of V * T (K x N2)
* - Workspace V for the copy of V (K x N2)
*/
workW = WORK;
ldW = M1;
workC = workW + M1 * K;
ldC = K;
if ( L == 0 ) {
workV = (cuDoubleComplex*)V;
ldV = LDV;
}
else {
if ( LWORK < wrsize ) {
workC = NULL;
workV = workW + M2 * K;
}
else {
workV = workC + K * N2;
}
ldV = K;
/*
* Backup V, and put 0 in the upper part
*/
cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
V, LDV * sizeof(cuDoubleComplex),
K * sizeof(cuDoubleComplex), N2,
cudaMemcpyDeviceToDevice, stream );
for(j = 1; j < K; j++) {
cudaMemsetAsync( workV + ldV + N2 - L + j,
0.,
j * sizeof(cuDoubleComplex),
stream );
}
}
/*
* W = A1 + A2 * V':
* W = A1
* W = W + A2 * V'
*
*/
cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
cudaMemcpy2DAsync( workW, ldW * sizeof(cuDoubleComplex),
A1, LDA1 * sizeof(cuDoubleComplex),
M1 * sizeof(cuDoubleComplex), K,
cudaMemcpyDeviceToDevice, stream );
......@@ -321,40 +410,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW),
M1, K, N2,
CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2,
V /* N2*K */, LDV,
CUBLAS_SADDR(zone), WORK /* M1*K */, LDWORK);
CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2,
workV /* K *N2 */, ldV,
CUBLAS_SADDR(zone), workW /* M1*K */, ldW);
if (WORKC == NULL) {
if ( workC == NULL ) {
/* W = W * op(T) */
CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit,
M2, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
workW, ldW,
CUBLAS_STREAM_VALUE );
/* A1 = A1 - W = A1 - W * op(T) */
for(j = 0; j < K; j++) {
cublasZaxpy(CUBLAS_HANDLE
M1, CUBLAS_SADDR(mzone),
(WORK + LDWORK*j), 1,
(A1 + LDA1*j), 1);
cublasZaxpy( CUBLAS_HANDLE
M1, CUBLAS_SADDR(mzone),
workW + ldW * j, 1,
A1 + LDA1 * j, 1 );
}
/* A2 = A2 - W * op(V) */
cublasZgemm(CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
M2, N2, K,
CUBLAS_SADDR(mzone), WORK /* M2*K */, LDWORK,
V /* K*N2 */, LDV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
CUBLAS_SADDR(mzone), workW /* M2*K */, ldW,
V /* K *N2 */, LDV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
} else {
/* A1 = A1 - W * opt(T) */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans),
M1, K, K,
CUBLAS_SADDR(mzone), WORK, LDWORK,
CUBLAS_SADDR(mzone), workW, ldW,
T, LDT,
CUBLAS_SADDR(zone), A1, LDA1 );
......@@ -363,15 +452,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
chameleon_cublas_const(trans), chameleon_cublas_const(transA2),
K, N2, K,
CUBLAS_SADDR(zone), T, LDT,
V, LDV,
CUBLAS_SADDR(zzero), WORKC, LDWORKC );
workV, ldV,
CUBLAS_SADDR(zzero), workC, ldC );
/* A2 = A2 - W * Wc */
cublasZgemm( CUBLAS_HANDLE
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K,
CUBLAS_SADDR(mzone), WORK, LDWORK,
WORKC, LDWORKC,
CUBLAS_SADDR(mzone), workW, ldW,
workC, ldC,
CUBLAS_SADDR(zone), A2, LDA2 );
}
}
......
/**
*
* @file cuda_ztpmlqt.c
*
* @copyright 2009-2016 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_ztpmlqt GPU kernel
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
/**
*******************************************************************************
*
* @ingroup CORE_CHAMELEON_Complex64_t
*
* @brief Applies a complex orthogonal matrix Q.
*
* The matrix Q is obtained from a "triangular-pentagonal" complex block
* reflector H to a general complex matrix C, which consists of two blocks A and
* B.
*
*******************************************************************************
*
* @param[in] side
* @arg ChamLeft : apply Q or Q**H from the Left;
* @arg ChamRight : apply Q or Q**H from the Right.
*
* @param[in] trans
* @arg ChamNoTrans : No transpose, apply Q;
* @arg ChamConjTrans : ConjTranspose, apply Q**H.
*
* @param[in] M
* The number of rows of the tile B. M >= 0.
*
* @param[in] N
* The number of columns of the tile B. N >= 0.
*
* @param[in] K
* The number of elementary reflectors whose product defines
* the matrix Q.
*
* @param[in] L
* The number of rows of the upper trapezoidal part of V.
* K >= L >= 0. See Further Details.
*
* @param[in] IB
* The inner-blocking size. IB >= 0.
*
* @param[in] V
* The i-th row must contain the vector which defines the
* elementary reflector H(i), for i = 1,2,...,k, as returned by
* CORE_ZTPQRT in the first k rows of its array argument V.
*
* @param[in] LDV
* The leading dimension of the array V. LDV >= max(1,K).
*
* @param[in] T
* The IB-by-N1 triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
* @param[in] LDT
* The leading dimension of the array T. LDT >= IB.
*
* @param[in,out] A
* A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
* or (LDA,K) if SIDE = ChamRight
* On entry, the K-by-N or M-by-K matrix A.
* On exit, A is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDA
* The leading dimension of the array A. LDA >= max(1,M).
* If side = ChamLeft, LDA >= max(1,K);
* If side = Chamright, LDA >= max(1,M).
*
* @param[in,out] B
* On entry, the M-by-N tile B.
* On exit, B is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDB
* The leading dimension of the tile B. LDB >= max(1,M).
*
* @param[out] WORK
* Workspace array of size LDWORK-by-NB.
* LDWORK = N if side = ChamLeft, or M if side = ChamRight.
*
*******************************************************************************
*
* @par Further Details:
* =====================
*
* The columns of the pentagonal matrix V contain the elementary reflectors
* H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
* trapezoidal block V2:
*