Mentions légales du service

Skip to content
Snippets Groups Projects
Commit 15d50172 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'cuda/check_return_value' into 'master'

Check return value of then CUDA call in debug mode

See merge request !289
parents 4ed75fb8 d1d730f0
No related branches found
No related tags found
1 merge request!289Check return value of then CUDA call in debug mode
Showing
with 325 additions and 322 deletions
...@@ -520,12 +520,6 @@ int main(void) { ...@@ -520,12 +520,6 @@ int main(void) {
HAVE_FALLTHROUGH HAVE_FALLTHROUGH
) )
# Add option to exploit cublas API v2
# -----------------------------------
cmake_dependent_option(CHAMELEON_USE_CUBLAS_V2
"Enable cublas API v2" ON
"CHAMELEON_USE_CUDA;CHAMELEON_SCHED_STARPU" OFF)
# Fix a problem on Mac OS X when building shared libraries # Fix a problem on Mac OS X when building shared libraries
if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
set(CMAKE_SHARED_LINKER_FLAGS "-undefined dynamic_lookup") set(CMAKE_SHARED_LINKER_FLAGS "-undefined dynamic_lookup")
......
...@@ -48,12 +48,8 @@ ...@@ -48,12 +48,8 @@
#include <cuda.h> #include <cuda.h>
#include <cuda_runtime.h> #include <cuda_runtime.h>
#include <cuda_runtime_api.h> #include <cuda_runtime_api.h>
#if defined(CHAMELEON_USE_CUBLAS_V2)
#include <cublas.h> #include <cublas.h>
#include <cublas_v2.h> #include <cublas_v2.h>
#else
#include <cublas.h>
#endif
#endif #endif
#if defined(CHAMELEON_USE_OPENCL) && !defined(CHAMELEON_SIMULATION) #if defined(CHAMELEON_USE_OPENCL) && !defined(CHAMELEON_SIMULATION)
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
# ------------------------------------------------------ # ------------------------------------------------------
set(CUDABLAS_SRCS_GENERATED "") set(CUDABLAS_SRCS_GENERATED "")
set(ZSRC set(ZSRC
cuda_zgeadd.c
cuda_zgemerge.c cuda_zgemerge.c
cuda_zgemm.c cuda_zgemm.c
cuda_zhemm.c cuda_zhemm.c
...@@ -52,13 +53,6 @@ set(ZSRC ...@@ -52,13 +53,6 @@ set(ZSRC
cuda_zunmqrt.c cuda_zunmqrt.c
) )
if( CHAMELEON_USE_CUBLAS_V2 )
set(ZSRC
${ZSRC}
cuda_zgeadd.c
)
endif( CHAMELEON_USE_CUBLAS_V2 )
# Former MAGMA files that are no longer supported # Former MAGMA files that are no longer supported
# if( CHAMELEON_USE_MAGMA ) # if( CHAMELEON_USE_MAGMA )
# set(ZSRC # set(ZSRC
...@@ -83,15 +77,9 @@ precisions_rules_py( ...@@ -83,15 +77,9 @@ precisions_rules_py(
set(CUDABLAS_SRCS set(CUDABLAS_SRCS
${CUDABLAS_SRCS_GENERATED} ${CUDABLAS_SRCS_GENERATED}
cudaglobal.c
) )
if (CHAMELEON_USE_CUBLAS_V2)
set(CUDABLAS_SRCS
${CUDABLAS_SRCS}
cudaglobal.c
)
endif (CHAMELEON_USE_CUBLAS_V2)
# Force generation of sources # Force generation of sources
# --------------------------- # ---------------------------
add_custom_target(cudablas_sources ALL SOURCES ${CUDABLAS_SRCS}) add_custom_target(cudablas_sources ALL SOURCES ${CUDABLAS_SRCS})
......
...@@ -19,10 +19,6 @@ ...@@ -19,10 +19,6 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
#if !defined(CHAMELEON_USE_CUBLAS_V2)
#error "This file requires cublas api v2 support"
#endif
/** /**
****************************************************************************** ******************************************************************************
* *
...@@ -76,22 +72,24 @@ ...@@ -76,22 +72,24 @@
* @retval <0 if -i, the i-th argument had an illegal value * @retval <0 if -i, the i-th argument had an illegal value
* *
*/ */
int CUDA_zgeadd(cham_trans_t trans, int CUDA_zgeadd( cham_trans_t trans,
int m, int n, int m, int n,
const cuDoubleComplex *alpha, const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *beta, const cuDoubleComplex *beta,
cuDoubleComplex *B, int ldb, cuDoubleComplex *B, int ldb,
CUBLAS_STREAM_PARAM) cublasHandle_t handle )
{ {
cublasZgeam(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
m, n,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), B, ldb,
B, ldb);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZgeam( handle,
chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
m, n,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), B, ldb,
B, ldb );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -25,11 +25,12 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, ...@@ -25,11 +25,12 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag,
int M, int N, int M, int N,
const cuDoubleComplex *A, int LDA, const cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB, cuDoubleComplex *B, int LDB,
CUBLAS_STREAM_PARAM) cublasHandle_t handle )
{ {
int i;
const cuDoubleComplex *cola; const cuDoubleComplex *cola;
cuDoubleComplex *colb; cuDoubleComplex *colb;
cublasStatus_t rc;
int i;
if (M < 0) { if (M < 0) {
return -1; return -1;
...@@ -44,26 +45,23 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, ...@@ -44,26 +45,23 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag,
return -7; return -7;
} }
CUBLAS_GET_STREAM;
if (side == ChamLeft){ if (side == ChamLeft){
for(i=0; i<N; i++){ for(i=0; i<N; i++){
cola = A + i*LDA; cola = A + i*LDA;
colb = B + i*LDB; colb = B + i*LDB;
cudaMemcpyAsync(colb , cola, rc = cublasZcopy( handle, i+1, cola, 1, colb, 1 );
(i+1)*sizeof(cuDoubleComplex), assert( rc == CUBLAS_STATUS_SUCCESS );
cudaMemcpyDeviceToDevice, stream);
} }
}else{ }else{
for(i=0; i<N; i++){ for(i=0; i<N; i++){
cola = A + i*LDA; cola = A + i*LDA;
colb = B + i*LDB; colb = B + i*LDB;
cudaMemcpyAsync(colb+i , cola+i, rc = cublasZcopy( handle, M-i, cola, 1, colb, 1 );
(M-i)*sizeof(cuDoubleComplex), assert( rc == CUBLAS_STATUS_SUCCESS );
cudaMemcpyDeviceToDevice, stream);
} }
} }
(void)diag; (void)diag;
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -20,22 +20,26 @@ ...@@ -20,22 +20,26 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zgemm(cham_trans_t transa, cham_trans_t transb, int
int m, int n, int k, CUDA_zgemm( cham_trans_t transa, cham_trans_t transb,
const cuDoubleComplex *alpha, int m, int n, int k,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *beta, const cuDoubleComplex *B, int ldb,
cuDoubleComplex *C, int ldc, const cuDoubleComplex *beta,
CUBLAS_STREAM_PARAM) cuDoubleComplex *C, int ldc,
cublasHandle_t handle )
{ {
cublasZgemm(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(transa), chameleon_cublas_const(transb),
m, n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZgemm( handle,
chameleon_cublas_const(transa), chameleon_cublas_const(transb),
m, n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -21,7 +21,8 @@ ...@@ -21,7 +21,8 @@
#include "cudablas.h" #include "cudablas.h"
#if defined(CHAMELEON_USE_MAGMA) #if defined(CHAMELEON_USE_MAGMA)
int CUDA_zgeqrt( int
CUDA_zgeqrt(
magma_int_t m, magma_int_t n, magma_int_t nb, magma_int_t m, magma_int_t n, magma_int_t nb,
magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *da, magma_int_t ldda,
magmaDoubleComplex *v, magma_int_t ldv, magmaDoubleComplex *v, magma_int_t ldv,
...@@ -32,7 +33,7 @@ int CUDA_zgeqrt( ...@@ -32,7 +33,7 @@ int CUDA_zgeqrt(
magmaDoubleComplex *tau, magmaDoubleComplex *tau,
magmaDoubleComplex *hwork, magmaDoubleComplex *hwork,
magmaDoubleComplex *dwork, magmaDoubleComplex *dwork,
CUstream stream) CUstream stream )
{ {
#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) #define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
#define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) #define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1))
......
...@@ -20,22 +20,26 @@ ...@@ -20,22 +20,26 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zhemm(cham_side_t side, cham_uplo_t uplo, int
int m, int n, CUDA_zhemm( cham_side_t side, cham_uplo_t uplo,
const cuDoubleComplex *alpha, int m, int n,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *beta, const cuDoubleComplex *B, int ldb,
cuDoubleComplex *C, int ldc, const cuDoubleComplex *beta,
CUBLAS_STREAM_PARAM) cuDoubleComplex *C, int ldc,
cublasHandle_t handle )
{ {
cublasZhemm(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZhemm( handle,
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -20,23 +20,26 @@ ...@@ -20,23 +20,26 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zher2k(cham_uplo_t uplo, cham_trans_t trans, int
int n, int k, CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans,
const cuDoubleComplex *alpha, int n, int k,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *A, int lda,
const double *beta, const cuDoubleComplex *B, int ldb,
cuDoubleComplex *C, int ldc, const double *beta,
CUBLAS_STREAM_PARAM) cuDoubleComplex *C, int ldc,
cublasHandle_t handle )
{ {
cublasZher2k(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZher2k( handle,
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -27,7 +27,7 @@ CUDA_zherfb( cham_uplo_t uplo, int n, ...@@ -27,7 +27,7 @@ CUDA_zherfb( cham_uplo_t uplo, int n,
const cuDoubleComplex *T, int ldt, const cuDoubleComplex *T, int ldt,
cuDoubleComplex *C, int ldc, cuDoubleComplex *C, int ldc,
cuDoubleComplex *WORK, int ldwork, cuDoubleComplex *WORK, int ldwork,
CUBLAS_STREAM_PARAM ) cublasHandle_t handle )
{ {
/* Check input arguments */ /* Check input arguments */
if ((uplo != ChamUpper) && (uplo != ChamLower)) { if ((uplo != ChamUpper) && (uplo != ChamLower)) {
...@@ -67,21 +67,21 @@ CUDA_zherfb( cham_uplo_t uplo, int n, ...@@ -67,21 +67,21 @@ CUDA_zherfb( cham_uplo_t uplo, int n,
/* Left */ /* Left */
CUDA_zunmqrt( ChamLeft, ChamConjTrans, n, n, k, ib, CUDA_zunmqrt( ChamLeft, ChamConjTrans, n, n, k, ib,
A, lda, T, ldt, C, ldc, WORK, ldwork, A, lda, T, ldt, C, ldc, WORK, ldwork,
CUBLAS_STREAM_VALUE ); handle );
/* Right */ /* Right */
CUDA_zunmqrt( ChamRight, ChamNoTrans, n, n, k, ib, CUDA_zunmqrt( ChamRight, ChamNoTrans, n, n, k, ib,
A, lda, T, ldt, C, ldc, WORK, ldwork, A, lda, T, ldt, C, ldc, WORK, ldwork,
CUBLAS_STREAM_VALUE ); handle );
} }
else { else {
/* Right */ /* Right */
CUDA_zunmlqt( ChamRight, ChamConjTrans, n, n, k, ib, CUDA_zunmlqt( ChamRight, ChamConjTrans, n, n, k, ib,
A, lda, T, ldt, C, ldc, WORK, ldwork, A, lda, T, ldt, C, ldc, WORK, ldwork,
CUBLAS_STREAM_VALUE ); handle );
/* Left */ /* Left */
CUDA_zunmlqt( ChamLeft, ChamNoTrans, n, n, k, ib, CUDA_zunmlqt( ChamLeft, ChamNoTrans, n, n, k, ib,
A, lda, T, ldt, C, ldc, WORK, ldwork, A, lda, T, ldt, C, ldc, WORK, ldwork,
CUBLAS_STREAM_VALUE ); handle );
} }
return 0; return 0;
} }
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, int
int n, int k, CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans,
const double *alpha, int n, int k,
const cuDoubleComplex *A, int lda, const double *alpha,
const double *beta, const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb, const double *beta,
CUBLAS_STREAM_PARAM) cuDoubleComplex *B, int ldb,
cublasHandle_t handle )
{ {
cublasZherk( CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), B, ldb);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZherk( handle,
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), B, ldb );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -30,7 +30,7 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, ...@@ -30,7 +30,7 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
const cuDoubleComplex *T, int LDT, const cuDoubleComplex *T, int LDT,
cuDoubleComplex *C, int LDC, cuDoubleComplex *C, int LDC,
cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORK, int LDWORK,
CUBLAS_STREAM_PARAM ) cublasHandle_t handle )
{ {
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
...@@ -42,8 +42,9 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, ...@@ -42,8 +42,9 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
double mzone = -1.0; double mzone = -1.0;
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ #endif /* defined(PRECISION_z) || defined(PRECISION_c) */
cham_trans_t transT, notransV, transV; cublasStatus_t rc;
cham_uplo_t uplo; cham_trans_t transT, notransV, transV;
cham_uplo_t uplo;
/* Check input arguments */ /* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) { if ((side != ChamLeft) && (side != ChamRight)) {
...@@ -103,54 +104,58 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, ...@@ -103,54 +104,58 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
// Comments assume H C. When forming H^H C, T gets transposed via transT. // Comments assume H C. When forming H^H C, T gets transposed via transT.
// W = C^H V // W = C^H V
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV),
N, K, M, N, K, M,
CUBLAS_SADDR(zone), C, LDC, CUBLAS_SADDR(zone), C, LDC,
V, LDV, V, LDV,
CUBLAS_SADDR(zzero), WORK, LDWORK ); CUBLAS_SADDR(zzero), WORK, LDWORK );
assert( rc == CUBLAS_STATUS_SUCCESS );
// W = W T^H = C^H V T^H // W = W T^H = C^H V T^H
CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit, CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
N, K, N, K,
&zone, T, LDT, &zone, T, LDT,
WORK, LDWORK, WORK, LDWORK,
CUBLAS_STREAM_VALUE ); handle );
// C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans),
M, N, K, M, N, K,
CUBLAS_SADDR(mzone), V, LDV, CUBLAS_SADDR(mzone), V, LDV,
WORK, LDWORK, WORK, LDWORK,
CUBLAS_SADDR(zone), C, LDC ); CUBLAS_SADDR(zone), C, LDC );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
else { else {
// Form C H or C H^H // Form C H or C H^H
// Comments assume C H. When forming C H^H, T gets transposed via trans. // Comments assume C H. When forming C H^H, T gets transposed via trans.
// W = C V // W = C V
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV),
M, K, N, M, K, N,
CUBLAS_SADDR(zone), C, LDC, CUBLAS_SADDR(zone), C, LDC,
V, LDV, V, LDV,
CUBLAS_SADDR(zzero), WORK, LDWORK ); CUBLAS_SADDR(zzero), WORK, LDWORK );
assert( rc == CUBLAS_STATUS_SUCCESS );
// W = W T = C V T // W = W T = C V T
CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit, CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
M, K, M, K,
&zone, T, LDT, &zone, T, LDT,
WORK, LDWORK, WORK, LDWORK,
CUBLAS_STREAM_VALUE ); handle );
// C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV),
M, N, K, M, N, K,
CUBLAS_SADDR(mzone), WORK, LDWORK, CUBLAS_SADDR(mzone), WORK, LDWORK,
V, LDV, V, LDV,
CUBLAS_SADDR(zone), C, LDC ); CUBLAS_SADDR(zone), C, LDC );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -146,7 +146,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -146,7 +146,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
const cuDoubleComplex *V, int LDV, const cuDoubleComplex *V, int LDV,
const cuDoubleComplex *T, int LDT, const cuDoubleComplex *T, int LDT,
cuDoubleComplex *WORK, int LWORK, cuDoubleComplex *WORK, int LWORK,
CUBLAS_STREAM_PARAM ) cublasHandle_t handle )
{ {
#if defined(PRECISION_z) || defined(PRECISION_c) #if defined(PRECISION_z) || defined(PRECISION_c)
cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
...@@ -159,14 +159,12 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -159,14 +159,12 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
#endif /* defined(PRECISION_z) || defined(PRECISION_c) */ #endif /* defined(PRECISION_z) || defined(PRECISION_c) */
cuDoubleComplex *workW, *workC, *workV; cuDoubleComplex *workW, *workC, *workV;
int ldW, ldC, ldV; cublasStatus_t rc;
int j; cudaStream_t stream;
cham_trans_t transW; int j, ldW, ldC, ldV;
cham_trans_t transA2; cham_trans_t transW, transA2;
int wssize = 0; int wssize = 0;
int wrsize = 0; int wrsize = 0;
CUBLAS_GET_STREAM;
/* Check input arguments */ /* Check input arguments */
if ((side != ChamLeft) && (side != ChamRight)) { if ((side != ChamLeft) && (side != ChamRight)) {
...@@ -219,6 +217,8 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -219,6 +217,8 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
cublasGetStream( handle, &stream );
if (direct == ChamDirForward) { if (direct == ChamDirForward) {
if (side == ChamLeft) { if (side == ChamLeft) {
...@@ -307,12 +307,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -307,12 +307,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
K, N1, M2, K, N1, M2,
CUBLAS_SADDR(zone), workV /* M2*K */, ldV, CUBLAS_SADDR(zone), workV /* M2*K */, ldV,
A2 /* M2*N2 */, LDA2, A2 /* M2*N2 */, LDA2,
CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); CUBLAS_SADDR(zone), workW /* K *N2 */, ldW );
assert( rc == CUBLAS_STATUS_SUCCESS );
if ( workC == NULL ) { if ( workC == NULL ) {
/* W = op(T) * W */ /* W = op(T) * W */
...@@ -320,48 +321,52 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -320,48 +321,52 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
K, N2, K, N2,
&zone, T, LDT, &zone, T, LDT,
workW, ldW, workW, ldW,
CUBLAS_STREAM_VALUE ); handle );
/* A1 = A1 - W = A1 - op(T) * W */ /* A1 = A1 - W = A1 - op(T) * W */
for(j = 0; j < N1; j++) { for(j = 0; j < N1; j++) {
cublasZaxpy( CUBLAS_HANDLE rc = cublasZaxpy( handle,
K, CUBLAS_SADDR(mzone), K, CUBLAS_SADDR(mzone),
workW + ldW * j, 1, workW + ldW * j, 1,
A1 + LDA1 * j, 1 ); A1 + LDA1 * j, 1 );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
/* A2 = A2 - op(V) * W */ /* A2 = A2 - op(V) * W */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV,
workW /* K * N2 */, ldW, workW /* K * N2 */, ldW,
CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 );
assert( rc == CUBLAS_STATUS_SUCCESS );
} else { } else {
/* Wc = V * op(T) */ /* Wc = V * op(T) */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(transA2), chameleon_cublas_const(trans), chameleon_cublas_const(transA2), chameleon_cublas_const(trans),
M2, K, K, M2, K, K,
CUBLAS_SADDR(zone), workV, ldV, CUBLAS_SADDR(zone), workV, ldV,
T, LDT, T, LDT,
CUBLAS_SADDR(zzero), workC, ldC ); CUBLAS_SADDR(zzero), workC, ldC );
/* A1 = A1 - opt(T) * W */ /* A1 = A1 - opt(T) * W */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
K, N1, K, K, N1, K,
CUBLAS_SADDR(mzone), T, LDT, CUBLAS_SADDR(mzone), T, LDT,
workW, ldW, workW, ldW,
CUBLAS_SADDR(zone), A1, LDA1 ); CUBLAS_SADDR(zone), A1, LDA1 );
assert( rc == CUBLAS_STATUS_SUCCESS );
/* A2 = A2 - Wc * W */ /* A2 = A2 - Wc * W */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), workC, ldC, CUBLAS_SADDR(mzone), workC, ldC,
workW, ldW, workW, ldW,
CUBLAS_SADDR(zone), A2, LDA2 ); CUBLAS_SADDR(zone), A2, LDA2 );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
} }
else { else {
...@@ -450,12 +455,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -450,12 +455,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
transW = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; transW = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
transA2 = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transA2 = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
cublasZgemm(CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW),
M1, K, N2, M1, K, N2,
CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2,
workV /* K *N2 */, ldV, workV /* K *N2 */, ldV,
CUBLAS_SADDR(zone), workW /* M1*K */, ldW); CUBLAS_SADDR(zone), workW /* M1*K */, ldW);
assert( rc == CUBLAS_STATUS_SUCCESS );
if ( workC == NULL ) { if ( workC == NULL ) {
/* W = W * op(T) */ /* W = W * op(T) */
...@@ -463,48 +469,53 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, ...@@ -463,48 +469,53 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans,
M2, K, M2, K,
&zone, T, LDT, &zone, T, LDT,
workW, ldW, workW, ldW,
CUBLAS_STREAM_VALUE ); handle );
/* A1 = A1 - W = A1 - W * op(T) */ /* A1 = A1 - W = A1 - W * op(T) */
for(j = 0; j < K; j++) { for(j = 0; j < K; j++) {
cublasZaxpy( CUBLAS_HANDLE rc = cublasZaxpy( handle,
M1, CUBLAS_SADDR(mzone), M1, CUBLAS_SADDR(mzone),
workW + ldW * j, 1, workW + ldW * j, 1,
A1 + LDA1 * j, 1 ); A1 + LDA1 * j, 1 );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
/* A2 = A2 - W * op(V) */ /* A2 = A2 - W * op(V) */
cublasZgemm(CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, CUBLAS_SADDR(mzone), workW /* M2*K */, ldW,
workV /* K *N2 */, ldV, workV /* K *N2 */, ldV,
CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2);
assert( rc == CUBLAS_STATUS_SUCCESS );
} else { } else {
/* A1 = A1 - W * opt(T) */ /* A1 = A1 - W * opt(T) */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans),
M1, K, K, M1, K, K,
CUBLAS_SADDR(mzone), workW, ldW, CUBLAS_SADDR(mzone), workW, ldW,
T, LDT, T, LDT,
CUBLAS_SADDR(zone), A1, LDA1 ); CUBLAS_SADDR(zone), A1, LDA1 );
assert( rc == CUBLAS_STATUS_SUCCESS );
/* Wc = op(T) * V */ /* Wc = op(T) * V */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(trans), chameleon_cublas_const(transA2), chameleon_cublas_const(trans), chameleon_cublas_const(transA2),
K, N2, K, K, N2, K,
CUBLAS_SADDR(zone), T, LDT, CUBLAS_SADDR(zone), T, LDT,
workV, ldV, workV, ldV,
CUBLAS_SADDR(zzero), workC, ldC ); CUBLAS_SADDR(zzero), workC, ldC );
assert( rc == CUBLAS_STATUS_SUCCESS );
/* A2 = A2 - W * Wc */ /* A2 = A2 - W * Wc */
cublasZgemm( CUBLAS_HANDLE rc = cublasZgemm( handle,
chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
M2, N2, K, M2, N2, K,
CUBLAS_SADDR(mzone), workW, ldW, CUBLAS_SADDR(mzone), workW, ldW,
workC, ldC, workC, ldC,
CUBLAS_SADDR(zone), A2, LDA2 ); CUBLAS_SADDR(zone), A2, LDA2 );
assert( rc == CUBLAS_STATUS_SUCCESS );
} }
} }
} }
......
...@@ -20,23 +20,26 @@ ...@@ -20,23 +20,26 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int
int m, int n, CUDA_zsymm( cham_side_t side, cham_uplo_t uplo,
const cuDoubleComplex *alpha, int m, int n,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *beta, const cuDoubleComplex *B, int ldb,
cuDoubleComplex *C, int ldc, const cuDoubleComplex *beta,
CUBLAS_STREAM_PARAM ) cuDoubleComplex *C, int ldc,
cublasHandle_t handle )
{ {
cublasZsymm(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZsymm( handle,
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -20,24 +20,26 @@ ...@@ -20,24 +20,26 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zsyr2k( int
cham_uplo_t uplo, cham_trans_t trans, CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans,
int n, int k, int n, int k,
const cuDoubleComplex *alpha, const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *A, int lda,
const cuDoubleComplex *B, int ldb, const cuDoubleComplex *B, int ldb,
const cuDoubleComplex *beta, const cuDoubleComplex *beta,
cuDoubleComplex *C, int ldc, cuDoubleComplex *C, int ldc,
CUBLAS_STREAM_PARAM) cublasHandle_t handle )
{ {
cublasZsyr2k(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZsyr2k( handle,
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
B, ldb,
CUBLAS_VALUE(beta), C, ldc );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -20,21 +20,24 @@ ...@@ -20,21 +20,24 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_zsyrk(cham_uplo_t uplo, cham_trans_t trans, int
int n, int k, CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans,
const cuDoubleComplex *alpha, int n, int k,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
const cuDoubleComplex *beta, const cuDoubleComplex *A, int lda,
cuDoubleComplex *C, int ldc, const cuDoubleComplex *beta,
CUBLAS_STREAM_PARAM) cuDoubleComplex *B, int ldb,
cublasHandle_t handle )
{ {
cublasZsyrk(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), C, ldc);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZsyrk( handle,
chameleon_cublas_const(uplo), chameleon_cublas_const(trans),
n, k,
CUBLAS_VALUE(alpha), A, lda,
CUBLAS_VALUE(beta), B, ldb );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -142,7 +142,7 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, ...@@ -142,7 +142,7 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
cuDoubleComplex *A, int LDA, cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB, cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK, int lwork, cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM ) cublasHandle_t handle )
{ {
int m1, n1; int m1, n1;
...@@ -166,14 +166,14 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, ...@@ -166,14 +166,14 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB, CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT, A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork, WORK, lwork,
CUBLAS_STREAM_VALUE ); handle );
} }
/* TT case */ /* TT case */
else if( L == N ) { else if( L == N ) {
CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB, CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT, A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork, WORK, lwork,
CUBLAS_STREAM_VALUE ); handle );
} }
else { else {
cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" ); cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" );
......
...@@ -144,7 +144,7 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, ...@@ -144,7 +144,7 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
cuDoubleComplex *A, int LDA, cuDoubleComplex *A, int LDA,
cuDoubleComplex *B, int LDB, cuDoubleComplex *B, int LDB,
cuDoubleComplex *WORK, int lwork, cuDoubleComplex *WORK, int lwork,
CUBLAS_STREAM_PARAM ) cublasHandle_t handle )
{ {
int m1, n1; int m1, n1;
...@@ -168,14 +168,14 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, ...@@ -168,14 +168,14 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB, CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT, A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork, WORK, lwork,
CUBLAS_STREAM_VALUE ); handle );
} }
/* TT case */ /* TT case */
else if( L == M ) { else if( L == M ) {
CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
A, LDA, B, LDB, V, LDV, T, LDT, A, LDA, B, LDB, V, LDV, T, LDT,
WORK, lwork, WORK, lwork,
CUBLAS_STREAM_VALUE ); handle );
} }
else { else {
cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" ); cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" );
......
...@@ -20,40 +20,27 @@ ...@@ -20,40 +20,27 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_ztrmm( int
cham_side_t side, cham_uplo_t uplo, CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo,
cham_trans_t transa, cham_diag_t diag, cham_trans_t transa, cham_diag_t diag,
int m, int n, int m, int n,
const cuDoubleComplex *alpha, const cuDoubleComplex *alpha,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *A, int lda,
cuDoubleComplex *B, int ldb, cuDoubleComplex *B, int ldb,
CUBLAS_STREAM_PARAM) cublasHandle_t handle )
{ {
cublasStatus_t rc;
#if defined(CHAMELEON_USE_CUBLAS_V2)
rc = cublasZtrmm( handle,
cublasZtrmm( chameleon_cublas_const(side), chameleon_cublas_const(uplo),
CUBLAS_HANDLE chameleon_cublas_const(transa), chameleon_cublas_const(diag),
chameleon_cublas_const(side), chameleon_cublas_const(uplo), m, n,
chameleon_cublas_const(transa), chameleon_cublas_const(diag), CUBLAS_VALUE(alpha), A, lda,
m, n, B, ldb,
CUBLAS_VALUE(alpha), A, lda, B, ldb );
B, ldb,
B, ldb); assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
#else
cublasZtrmm(
CUBLAS_HANDLE
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
chameleon_cublas_const(transa), chameleon_cublas_const(diag),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb);
#endif
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
...@@ -20,22 +20,25 @@ ...@@ -20,22 +20,25 @@
*/ */
#include "cudablas.h" #include "cudablas.h"
int CUDA_ztrsm(cham_side_t side, cham_uplo_t uplo, int
cham_trans_t transa, cham_diag_t diag, CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo,
int m, int n, cham_trans_t transa, cham_diag_t diag,
const cuDoubleComplex *alpha, int m, int n,
const cuDoubleComplex *A, int lda, const cuDoubleComplex *alpha,
cuDoubleComplex *B, int ldb, const cuDoubleComplex *A, int lda,
CUBLAS_STREAM_PARAM) cuDoubleComplex *B, int ldb,
cublasHandle_t handle )
{ {
cublasZtrsm(CUBLAS_HANDLE cublasStatus_t rc;
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
chameleon_cublas_const(transa), chameleon_cublas_const(diag),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb);
assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); rc = cublasZtrsm( handle,
chameleon_cublas_const(side), chameleon_cublas_const(uplo),
chameleon_cublas_const(transa), chameleon_cublas_const(diag),
m, n,
CUBLAS_VALUE(alpha), A, lda,
B, ldb );
assert( rc == CUBLAS_STATUS_SUCCESS );
(void)rc;
return CHAMELEON_SUCCESS; return CHAMELEON_SUCCESS;
} }
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment