Commit f343a4fa authored by Mathieu Faverge's avatar Mathieu Faverge

Add missing tpmlqt kernel for GPU, and fix the parfb kernel to handle the TT cases

parent 5ed9ade8
......@@ -19,7 +19,7 @@
#
# @version 1.0.0
# @author Florent Pruvost
# @date 2015-09-16
# @date 2018-11-09
#
###
......@@ -38,11 +38,13 @@ set(ZSRC
cuda_zsymm.c
cuda_zsyr2k.c
cuda_zsyrk.c
cuda_ztpmlqt.c
cuda_ztpmqrt.c
cuda_ztrmm.c
cuda_ztrsm.c
cuda_ztsmlq.c
cuda_ztsmqr.c
cuda_zttmlq.c
cuda_zttmqr.c
cuda_zunmlqt.c
cuda_zunmqrt.c
......
......@@ -15,21 +15,21 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int
CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
cham_dir_t direct, cham_store_t storev,
int M, int N, int K,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM )
{
#if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
......@@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
}
/* Quick return */
if ((M == 0) || (N == 0) || (K == 0))
if ((M == 0) || (N == 0) || (K == 0)) {
return CHAMELEON_SUCCESS;
}
// opposite of trans
if (trans == ChamNoTrans)
if (trans == ChamNoTrans) {
transT = ChamConjTrans;
else
}
else {
transT = ChamNoTrans;
}
// whether T is upper or lower triangular
if (direct == ChamDirForward)
if (direct == ChamDirForward) {
uplo = ChamUpper;
else
}
else {
uplo = ChamLower;
}
if (storev == ChamColumnwise) {
notransV = ChamNoTrans;
......@@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T^H = C^H V T^H
CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
N, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
......@@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
// W = W T = C V T
CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
M, K,
CUBLAS_SADDR(zone), T, LDT,
WORK, LDWORK,
&zone, T, LDT,
WORK, LDWORK,
CUBLAS_STREAM_VALUE );
// C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
......
This diff is collapsed.
/**
*
* @file cuda_ztpmlqt.c
*
* @copyright 2009-2016 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_ztpmlqt GPU kernel
*
* @version 1.0.0
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
/**
*******************************************************************************
*
* @ingroup CORE_CHAMELEON_Complex64_t
*
* @brief Applies a complex orthogonal matrix Q.
*
* The matrix Q is obtained from a "triangular-pentagonal" complex block
* reflector H to a general complex matrix C, which consists of two blocks A and
* B.
*
*******************************************************************************
*
* @param[in] side
* @arg ChamLeft : apply Q or Q**H from the Left;
* @arg ChamRight : apply Q or Q**H from the Right.
*
* @param[in] trans
* @arg ChamNoTrans : No transpose, apply Q;
* @arg ChamConjTrans : ConjTranspose, apply Q**H.
*
* @param[in] M
* The number of rows of the tile B. M >= 0.
*
* @param[in] N
* The number of columns of the tile B. N >= 0.
*
* @param[in] K
* The number of elementary reflectors whose product defines
* the matrix Q.
*
* @param[in] L
* The number of rows of the upper trapezoidal part of V.
* K >= L >= 0. See Further Details.
*
* @param[in] IB
* The inner-blocking size. IB >= 0.
*
* @param[in] V
* The i-th row must contain the vector which defines the
* elementary reflector H(i), for i = 1,2,...,k, as returned by
* CORE_ZTPQRT in the first k rows of its array argument V.
*
* @param[in] LDV
* The leading dimension of the array V. LDV >= max(1,K).
*
* @param[in] T
* The IB-by-N1 triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
* @param[in] LDT
* The leading dimension of the array T. LDT >= IB.
*
* @param[in,out] A
* A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
* or (LDA,K) if SIDE = ChamRight
* On entry, the K-by-N or M-by-K matrix A.
* On exit, A is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDA
* The leading dimension of the array A. LDA >= max(1,M).
* If side = ChamLeft, LDA >= max(1,K);
* If side = Chamright, LDA >= max(1,M).
*
* @param[in,out] B
* On entry, the M-by-N tile B.
* On exit, B is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDB
* The leading dimension of the tile B. LDB >= max(1,M).
*
* @param[out] WORK
* Workspace array of size LDWORK-by-NB.
* LDWORK = N if side = ChamLeft, or M if side = ChamRight.
*
*******************************************************************************
*
* @par Further Details:
* =====================
*
* The columns of the pentagonal matrix V contain the elementary reflectors
* H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
* trapezoidal block V2:
*
* V = [V1] [V2].
*
* The size of the trapezoidal block V2 is determined by the parameter L,
* where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L
* rows of a K-by-K upper triangular matrix. If L=K, V2 is lower triangular;
* if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
*
* If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is K-by-M.
* [B]
*
* If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is K-by-N.
*
* The complex orthogonal matrix Q is formed from V and T.
*
* If trans='N' and side='L', C is on exit replaced with Q * C.
*
* If trans='C' and side='L', C is on exit replaced with Q**H * C.
*
* If trans='N' and side='R', C is on exit replaced with C * Q.
*
* If trans='C' and side='R', C is on exit replaced with C * Q**H.
*
*******************************************************************************
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
*
*/
int
CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int IB,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM )
{
int m1, n1;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
cudablas_error(1, "Illegal value of side");
return -1;
}
if ( side == ChamLeft ) {
m1 = K;
n1 = N;
}
else {
m1 = M;
n1 = K;
}
/* TS case */
if (L == 0) {
CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
/* TT case */
else if( L == N ) {
CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
else {
cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" );
return -6;
}
return CHAMELEON_SUCCESS;
}
......@@ -13,12 +13,128 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
/**
*******************************************************************************
*
* @ingroup CORE_CHAMELEON_Complex64_t
*
* @brief Applies a complex orthogonal matrix Q.
*
* The matrix Q is obtained from a "triangular-pentagonal" complex block
* reflector H to a general complex matrix C, which consists of two blocks A and
* B.
*
*******************************************************************************
*
* @param[in] side
* @arg ChamLeft : apply Q or Q**H from the Left;
* @arg ChamRight : apply Q or Q**H from the Right.
*
* @param[in] trans
* @arg ChamNoTrans : No transpose, apply Q;
* @arg ChamConjTrans : ConjTranspose, apply Q**H.
*
* @param[in] M
* The number of rows of the tile B. M >= 0.
*
* @param[in] N
* The number of columns of the tile B. N >= 0.
*
* @param[in] K
* The number of elementary reflectors whose product defines
* the matrix Q.
*
* @param[in] L
* The number of rows of the upper trapezoidal part of V.
* K >= L >= 0. See Further Details.
*
* @param[in] IB
* The inner-blocking size. IB >= 0.
*
* @param[in] V
* The i-th row must contain the vector which defines the
* elementary reflector H(i), for i = 1,2,...,k, as returned by
* CORE_ZTPQRT in the first k rows of its array argument V.
*
* @param[in] LDV
* The leading dimension of the array V. LDV >= max(1,K).
*
* @param[in] T
* The IB-by-N1 triangular factor T of the block reflector.
* T is upper triangular by block (economic storage);
* The rest of the array is not referenced.
*
* @param[in] LDT
* The leading dimension of the array T. LDT >= IB.
*
* @param[in,out] A
* A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
* or (LDA,K) if SIDE = ChamRight
* On entry, the K-by-N or M-by-K matrix A.
* On exit, A is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDA
* The leading dimension of the array A. LDA >= max(1,M).
* If side = ChamLeft, LDA >= max(1,K);
* If side = Chamright, LDA >= max(1,M).
*
* @param[in,out] B
* On entry, the M-by-N tile B.
* On exit, B is overwritten by the corresponding block of
* Q*C or Q**H*C or C*Q or C*Q**H. See Further Details.
*
* @param[in] LDB
* The leading dimension of the tile B. LDB >= max(1,M).
*
* @param[out] WORK
* Workspace array of size LDWORK-by-NB.
* LDWORK = N if side = ChamLeft, or M if side = ChamRight.
*
*******************************************************************************
*
* @par Further Details:
* =====================
*
* The columns of the pentagonal matrix V contain the elementary reflectors
* H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
* trapezoidal block V2:
*
* V = [V1]
* [V2].
*
* The size of the trapezoidal block V2 is determined by the parameter L,
* where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L
* rows of a K-by-K upper triangular matrix. If L=K, V2 is upper triangular;
* if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
*
* If side = ChamLeft: C = [A] where A is K-by-N, B is M-by-N and V is M-by-K.
* [B]
*
* If side = ChamRight: C = [A B] where A is M-by-K, B is M-by-N and V is N-by-K.
*
* The complex orthogonal matrix Q is formed from V and T.
*
* If trans='N' and side='L', C is on exit replaced with Q * C.
*
* If trans='C' and side='L', C is on exit replaced with Q**H * C.
*
* If trans='N' and side='R', C is on exit replaced with C * Q.
*
* If trans='C' and side='R', C is on exit replaced with C * Q**H.
*
*******************************************************************************
*
* @retval CHAMELEON_SUCCESS successful exit
* @retval <0 if -i, the i-th argument had an illegal value
*
*/
int
CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
int M, int N, int K, int L, int IB,
......@@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK,
cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM )
{
int m1, n1, ldwork, ldworkc, ws;
int m1, n1;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
......@@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
if ( side == ChamLeft ) {
m1 = K;
n1 = N;
ldwork = IB;
ldworkc = M;
ws = ldwork * n1;
}
else {
m1 = M;
n1 = K;
ldwork = chameleon_max( K, chameleon_max( M, N ) );
ldworkc = IB;
ws = ldwork * IB;
}
/* TS case */
if (L == 0) {
CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, ldwork, WORK + ws, ldworkc,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
/* TT case */
else if( L == M ) {
CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT,
WORK, ldwork, WORK + ws, ldworkc,
WORK, lwork,
CUBLAS_STREAM_VALUE );
}
else {
......
......@@ -13,24 +13,24 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_ztsmlq(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM)
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NW;
......@@ -90,21 +90,20 @@ int CUDA_ztsmlq(
if (LDT < chameleon_max(1,IB)){
return -16;
}
if (LDWORK < chameleon_max(1,NW)){
return -18;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if (((side == ChamLeft) && (trans == ChamNoTrans))
|| ((side == ChamRight) && (trans != ChamNoTrans))) {
if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) ||
((side == ChamRight) && (trans != ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ((K-1) / IB)*IB;
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
......@@ -115,7 +114,7 @@ int CUDA_ztsmlq(
trans = ChamNoTrans;
}
for(i = i1; (i > -1) && (i < K); i += i3) {
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
......@@ -137,13 +136,13 @@ int CUDA_ztsmlq(
* Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
*/
CUDA_zparfb(
side, trans, ChamDirForward, ChamRowwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + i, LDV,
T + LDT*i, LDT,
WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
side, trans, ChamDirForward, ChamRowwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + i, LDV,
T + LDT*i, LDT,
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
......@@ -13,24 +13,24 @@
*
* @version 1.0.0
* @author Florent Pruvost
* @date 2015-09-16
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_ztsmqr(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LDWORK,
cuDoubleComplex *WORKC, int LDWORKC,
CUBLAS_STREAM_PARAM)
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NQ, NW;
......@@ -92,25 +92,24 @@ int CUDA_ztsmqr(
if (LDT < chameleon_max(1,IB)){
return -16;
}
if (LDWORK < chameleon_max(1,NW)){
return -18;
}
/* Quick return */
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
return CHAMELEON_SUCCESS;
}
if (((side == ChamLeft) && (trans != ChamNoTrans))
|| ((side == ChamRight) && (trans == ChamNoTrans))) {
if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) ||
((side == ChamRight) && (trans == ChamNoTrans)) )
{
i1 = 0;
i3 = IB;
}
else {
i1 = ((K-1) / IB)*IB;
i1 = ( ( K-1 ) / IB )*IB;
i3 = -IB;
}
for(i = i1; (i > -1) && (i < K); i += i3) {
for (i = i1; (i > -1) && (i < K); i+=i3) {
kb = chameleon_min(IB, K-i);
if (side == ChamLeft) {
......@@ -127,17 +126,18 @@ int CUDA_ztsmqr(
ni = N1 - i;
jc = i;
}
/*
* Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
*/
CUDA_zparfb(
side, trans, ChamDirForward, ChamColumnwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + LDV*i, LDV,
T + LDT*i, LDT,
WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
side, trans, ChamDirForward, ChamColumnwise,
mi, ni, M2, N2, kb, 0,
A1 + LDA1*jc+ic, LDA1,
A2, LDA2,
V + LDV*i, LDV,
T + LDT*i, LDT,
WORK, LWORK, CUBLAS_STREAM_VALUE );
}
return CHAMELEON_SUCCESS;
}
/**
*
* @file cuda_zttmlq.c
*
* @copyright 2009-2014 The University of Tennessee and The University of
* Tennessee Research Foundation. All rights reserved.
* @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
* Univ. Bordeaux. All rights reserved.
*
***
*
* @brief Chameleon cuda_zttmlq GPU kernel
*
* @version 1.0.0
* @author Florent Pruvost
* @author Mathieu Faverge
* @date 2018-11-09
* @precisions normal z -> c d s
*
*/
#include "cudablas.h"
int CUDA_zttmlq(
cham_side_t side, cham_trans_t trans,
int M1, int N1,
int M2, int N2,
int K, int IB,
cuDoubleComplex *A1, int LDA1,
cuDoubleComplex *A2, int LDA2,
const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM)
{
int i, i1, i3;
int NW;
int kb, l;
int ic = 0;
int jc = 0;
int mi1 = M1;
int mi2 = M2;
int ni1 = N1;
int ni2 = N2;
/* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) {
return -1;
}
/* NQ is the order of Q */
if (side == ChamLeft) {
NW = IB;
}
else {
NW = N1;
}
if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
return -2;
}
if (M1 < 0) {
return -3;
}
if (N1 < 0) {
return -4;
}
if ( (M2 < 0) ||
( (M2 != M1) && (side == ChamRight) ) ){
return -5;
}
if ( (N2 < 0) ||
( (N2 != N1) && (side == ChamLeft) ) ){
return -6;
}
if ((K < 0) ||
( (side == ChamLeft) && (K > M1) ) ||
( (side == ChamRight) && (K > N1) ) ) {
return -7;