diff --git a/CMakeLists.txt b/CMakeLists.txt index b064cf15135af1b366763e1e8a78e9b903f02047..45403fd80b8c375aa15899acfd6854717285fbbd 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -520,12 +520,6 @@ int main(void) { HAVE_FALLTHROUGH ) -# Add option to exploit cublas API v2 -# ----------------------------------- -cmake_dependent_option(CHAMELEON_USE_CUBLAS_V2 - "Enable cublas API v2" ON - "CHAMELEON_USE_CUDA;CHAMELEON_SCHED_STARPU" OFF) - # Fix a problem on Mac OS X when building shared libraries if (${CMAKE_SYSTEM_NAME} MATCHES "Darwin") set(CMAKE_SHARED_LINKER_FLAGS "-undefined dynamic_lookup") diff --git a/control/common.h b/control/common.h index ed2daacc04f7c7389b6fe3816a38a3f9a0dec08c..7eab1f417994b7910263d40e2e43e1d02455e8b0 100644 --- a/control/common.h +++ b/control/common.h @@ -48,12 +48,8 @@ #include <cuda.h> #include <cuda_runtime.h> #include <cuda_runtime_api.h> -#if defined(CHAMELEON_USE_CUBLAS_V2) #include <cublas.h> #include <cublas_v2.h> -#else -#include <cublas.h> -#endif #endif #if defined(CHAMELEON_USE_OPENCL) && !defined(CHAMELEON_SIMULATION) diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index c61bc0496a850dcddd8b23a6150d6ebf06e55ee8..a3473603668f5d755ff2a66ab28903304a786591 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -29,6 +29,7 @@ # ------------------------------------------------------ set(CUDABLAS_SRCS_GENERATED "") set(ZSRC + cuda_zgeadd.c cuda_zgemerge.c cuda_zgemm.c cuda_zhemm.c @@ -52,13 +53,6 @@ set(ZSRC cuda_zunmqrt.c ) -if( CHAMELEON_USE_CUBLAS_V2 ) - set(ZSRC - ${ZSRC} - cuda_zgeadd.c - ) -endif( CHAMELEON_USE_CUBLAS_V2 ) - # Former MAGMA files that are no longer supported # if( CHAMELEON_USE_MAGMA ) # set(ZSRC @@ -83,15 +77,9 @@ precisions_rules_py( set(CUDABLAS_SRCS ${CUDABLAS_SRCS_GENERATED} + cudaglobal.c ) -if (CHAMELEON_USE_CUBLAS_V2) - set(CUDABLAS_SRCS - ${CUDABLAS_SRCS} - cudaglobal.c - ) -endif (CHAMELEON_USE_CUBLAS_V2) - # Force generation of sources # --------------------------- add_custom_target(cudablas_sources ALL SOURCES ${CUDABLAS_SRCS}) diff --git a/cudablas/compute/cuda_zgeadd.c b/cudablas/compute/cuda_zgeadd.c index 1c917cf20f197d4664f79001a02d166c6bbf9aba..78ebaf077bb1004dda107384133da93044f2f263 100644 --- a/cudablas/compute/cuda_zgeadd.c +++ b/cudablas/compute/cuda_zgeadd.c @@ -19,10 +19,6 @@ */ #include "cudablas.h" -#if !defined(CHAMELEON_USE_CUBLAS_V2) -#error "This file requires cublas api v2 support" -#endif - /** ****************************************************************************** * @@ -76,22 +72,24 @@ * @retval <0 if -i, the i-th argument had an illegal value * */ -int CUDA_zgeadd(cham_trans_t trans, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int CUDA_zgeadd( cham_trans_t trans, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZgeam(CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), - m, n, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), B, ldb, - B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZgeam( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), + m, n, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb, + B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c index 9f3f8d134b1695e69ce1537c429ef013cbd945de..de144f7904a9340683e0da4a7e2fb59d9d474319 100644 --- a/cudablas/compute/cuda_zgemerge.c +++ b/cudablas/compute/cuda_zgemerge.c @@ -25,11 +25,12 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, - CUBLAS_STREAM_PARAM) + cublasHandle_t handle ) { - int i; const cuDoubleComplex *cola; cuDoubleComplex *colb; + cublasStatus_t rc; + int i; if (M < 0) { return -1; @@ -44,26 +45,23 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, return -7; } - CUBLAS_GET_STREAM; - if (side == ChamLeft){ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cudaMemcpyAsync(colb , cola, - (i+1)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); + rc = cublasZcopy( handle, i+1, cola, 1, colb, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } }else{ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cudaMemcpyAsync(colb+i , cola+i, - (M-i)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); + rc = cublasZcopy( handle, M-i, cola, 1, colb, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } (void)diag; + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgemm.c b/cudablas/compute/cuda_zgemm.c index 7a5900b761ad081181e6b47977e540aa16797b1b..7a9da427a944164fe1b3537bd5191e7d2be88c51 100644 --- a/cudablas/compute/cuda_zgemm.c +++ b/cudablas/compute/cuda_zgemm.c @@ -20,22 +20,26 @@ */ #include "cudablas.h" -int CUDA_zgemm(cham_trans_t transa, cham_trans_t transb, - int m, int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transa), chameleon_cublas_const(transb), - m, n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transa), chameleon_cublas_const(transb), + m, n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgeqrt.c b/cudablas/compute/cuda_zgeqrt.c index 32abd8f90d3a87e7efacfa11264c3c096e18df30..d2d33026f7f290ab9d92b69de406d4a98d13c413 100644 --- a/cudablas/compute/cuda_zgeqrt.c +++ b/cudablas/compute/cuda_zgeqrt.c @@ -21,7 +21,8 @@ #include "cudablas.h" #if defined(CHAMELEON_USE_MAGMA) -int CUDA_zgeqrt( +int +CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, @@ -32,7 +33,7 @@ int CUDA_zgeqrt( magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, - CUstream stream) + CUstream stream ) { #define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) #define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) diff --git a/cudablas/compute/cuda_zhemm.c b/cudablas/compute/cuda_zhemm.c index 26a470b3414aed3777a23e526ccd8e30febe75b4..95201c0357e6d3cdc0efe6ec05e06d892f581764 100644 --- a/cudablas/compute/cuda_zhemm.c +++ b/cudablas/compute/cuda_zhemm.c @@ -20,22 +20,26 @@ */ #include "cudablas.h" -int CUDA_zhemm(cham_side_t side, cham_uplo_t uplo, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zhemm( cham_side_t side, cham_uplo_t uplo, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZhemm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZhemm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zher2k.c b/cudablas/compute/cuda_zher2k.c index 0fd08570d9722bbdcc9a0f419008f343355077a7..bc4e69c839ddcc64354464492dcd76265471762d 100644 --- a/cudablas/compute/cuda_zher2k.c +++ b/cudablas/compute/cuda_zher2k.c @@ -20,23 +20,26 @@ */ #include "cudablas.h" -int CUDA_zher2k(cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const double *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const double *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZher2k(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZher2k( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zherfb.c b/cudablas/compute/cuda_zherfb.c index 13a0d9c24046e459f3acaebe0416604297f368d2..36e1784fb72f355068d549bc5229f8157a3af1ea 100644 --- a/cudablas/compute/cuda_zherfb.c +++ b/cudablas/compute/cuda_zherfb.c @@ -27,7 +27,7 @@ CUDA_zherfb( cham_uplo_t uplo, int n, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { /* Check input arguments */ if ((uplo != ChamUpper) && (uplo != ChamLower)) { @@ -67,21 +67,21 @@ CUDA_zherfb( cham_uplo_t uplo, int n, /* Left */ CUDA_zunmqrt( ChamLeft, ChamConjTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); /* Right */ CUDA_zunmqrt( ChamRight, ChamNoTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { /* Right */ CUDA_zunmlqt( ChamRight, ChamConjTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); /* Left */ CUDA_zunmlqt( ChamLeft, ChamNoTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); } return 0; } diff --git a/cudablas/compute/cuda_zherk.c b/cudablas/compute/cuda_zherk.c index cd0d7ff6441932487e13e5ac4fc6ac9f59d3a12a..3ac3b66f12430e91fe50f0c1bef7e50888eae2c6 100644 --- a/cudablas/compute/cuda_zherk.c +++ b/cudablas/compute/cuda_zherk.c @@ -20,21 +20,24 @@ */ #include "cudablas.h" -int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const double *alpha, - const cuDoubleComplex *A, int lda, - const double *beta, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const double *alpha, + const cuDoubleComplex *A, int lda, + const double *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZherk( CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZherk( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index 3cc573ff59c813e7f9448ac847684e84de4a3f45..49a9baaa3622fbfbeed8f8ff173cc8d118c6c945 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -30,7 +30,7 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -42,8 +42,9 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, double mzone = -1.0; #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ - cham_trans_t transT, notransV, transV; - cham_uplo_t uplo; + cublasStatus_t rc; + cham_trans_t transT, notransV, transV; + cham_uplo_t uplo; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -103,54 +104,58 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, // Comments assume H C. When forming H^H C, T gets transposed via transT. // W = C^H V - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), - N, K, M, - CUBLAS_SADDR(zone), C, LDC, - V, LDV, - CUBLAS_SADDR(zzero), WORK, LDWORK ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), + N, K, M, + CUBLAS_SADDR(zone), C, LDC, + V, LDV, + CUBLAS_SADDR(zzero), WORK, LDWORK ); + assert( rc == CUBLAS_STATUS_SUCCESS ); // W = W T^H = C^H V T^H CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit, N, K, &zone, T, LDT, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), - M, N, K, - CUBLAS_SADDR(mzone), V, LDV, - WORK, LDWORK, - CUBLAS_SADDR(zone), C, LDC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), + M, N, K, + CUBLAS_SADDR(mzone), V, LDV, + WORK, LDWORK, + CUBLAS_SADDR(zone), C, LDC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { // Form C H or C H^H // Comments assume C H. When forming C H^H, T gets transposed via trans. // W = C V - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV), - M, K, N, - CUBLAS_SADDR(zone), C, LDC, - V, LDV, - CUBLAS_SADDR(zzero), WORK, LDWORK ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV), + M, K, N, + CUBLAS_SADDR(zone), C, LDC, + V, LDV, + CUBLAS_SADDR(zzero), WORK, LDWORK ); + assert( rc == CUBLAS_STATUS_SUCCESS ); // W = W T = C V T CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit, M, K, &zone, T, LDT, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV), - M, N, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, - V, LDV, - CUBLAS_SADDR(zone), C, LDC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV), + M, N, K, + CUBLAS_SADDR(mzone), WORK, LDWORK, + V, LDV, + CUBLAS_SADDR(zone), C, LDC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 419e2b3e5bd1f46eafd59b2f38255fe53afda9c5..476c8b7faef0e0bf6b3a6bb387b15d130682c9d3 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -146,7 +146,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -159,14 +159,12 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ cuDoubleComplex *workW, *workC, *workV; - int ldW, ldC, ldV; - int j; - cham_trans_t transW; - cham_trans_t transA2; - int wssize = 0; - int wrsize = 0; - - CUBLAS_GET_STREAM; + cublasStatus_t rc; + cudaStream_t stream; + int j, ldW, ldC, ldV; + cham_trans_t transW, transA2; + int wssize = 0; + int wrsize = 0; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -219,6 +217,8 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return CHAMELEON_SUCCESS; } + cublasGetStream( handle, &stream ); + if (direct == ChamDirForward) { if (side == ChamLeft) { @@ -307,12 +307,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), - K, N1, M2, - CUBLAS_SADDR(zone), workV /* M2*K */, ldV, - A2 /* M2*N2 */, LDA2, - CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), + K, N1, M2, + CUBLAS_SADDR(zone), workV /* M2*K */, ldV, + A2 /* M2*N2 */, LDA2, + CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + assert( rc == CUBLAS_STATUS_SUCCESS ); if ( workC == NULL ) { /* W = op(T) * W */ @@ -320,48 +321,52 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, K, N2, &zone, T, LDT, workW, ldW, - CUBLAS_STREAM_VALUE ); + handle ); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { - cublasZaxpy( CUBLAS_HANDLE - K, CUBLAS_SADDR(mzone), - workW + ldW * j, 1, - A1 + LDA1 * j, 1 ); + rc = cublasZaxpy( handle, + K, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } /* A2 = A2 - op(V) * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, - workW /* K * N2 */, ldW, - CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, + workW /* K * N2 */, ldW, + CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { /* Wc = V * op(T) */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(trans), - M2, K, K, - CUBLAS_SADDR(zone), workV, ldV, - T, LDT, - CUBLAS_SADDR(zzero), workC, ldC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transA2), chameleon_cublas_const(trans), + M2, K, K, + CUBLAS_SADDR(zone), workV, ldV, + T, LDT, + CUBLAS_SADDR(zzero), workC, ldC ); /* A1 = A1 - opt(T) * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), - K, N1, K, - CUBLAS_SADDR(mzone), T, LDT, - workW, ldW, - CUBLAS_SADDR(zone), A1, LDA1 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), + K, N1, K, + CUBLAS_SADDR(mzone), T, LDT, + workW, ldW, + CUBLAS_SADDR(zone), A1, LDA1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* A2 = A2 - Wc * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workC, ldC, - workW, ldW, - CUBLAS_SADDR(zone), A2, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workC, ldC, + workW, ldW, + CUBLAS_SADDR(zone), A2, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } else { @@ -450,12 +455,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, transW = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; transA2 = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), - M1, K, N2, - CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, - workV /* K *N2 */, ldV, - CUBLAS_SADDR(zone), workW /* M1*K */, ldW); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), + M1, K, N2, + CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), workW /* M1*K */, ldW); + assert( rc == CUBLAS_STATUS_SUCCESS ); if ( workC == NULL ) { /* W = W * op(T) */ @@ -463,48 +469,53 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, M2, K, &zone, T, LDT, workW, ldW, - CUBLAS_STREAM_VALUE ); + handle ); /* A1 = A1 - W = A1 - W * op(T) */ for(j = 0; j < K; j++) { - cublasZaxpy( CUBLAS_HANDLE - M1, CUBLAS_SADDR(mzone), - workW + ldW * j, 1, - A1 + LDA1 * j, 1 ); + rc = cublasZaxpy( handle, + M1, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } /* A2 = A2 - W * op(V) */ - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), - M2, N2, K, - CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, - workV /* K *N2 */, ldV, - CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), + M2, N2, K, + CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { /* A1 = A1 - W * opt(T) */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), - M1, K, K, - CUBLAS_SADDR(mzone), workW, ldW, - T, LDT, - CUBLAS_SADDR(zone), A1, LDA1 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), + M1, K, K, + CUBLAS_SADDR(mzone), workW, ldW, + T, LDT, + CUBLAS_SADDR(zone), A1, LDA1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* Wc = op(T) * V */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(transA2), - K, N2, K, - CUBLAS_SADDR(zone), T, LDT, - workV, ldV, - CUBLAS_SADDR(zzero), workC, ldC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(transA2), + K, N2, K, + CUBLAS_SADDR(zone), T, LDT, + workV, ldV, + CUBLAS_SADDR(zzero), workC, ldC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* A2 = A2 - W * Wc */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workW, ldW, - workC, ldC, - CUBLAS_SADDR(zone), A2, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workW, ldW, + workC, ldC, + CUBLAS_SADDR(zone), A2, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } } diff --git a/cudablas/compute/cuda_zsymm.c b/cudablas/compute/cuda_zsymm.c index 2ed61af73945442e9586c0c2b5a98fb909aecd2a..3dcfa14eb9b0eecae32a4d2df75e54382e0e4038 100644 --- a/cudablas/compute/cuda_zsymm.c +++ b/cudablas/compute/cuda_zsymm.c @@ -20,23 +20,26 @@ */ #include "cudablas.h" -int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM ) +int +CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZsymm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsymm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zsyr2k.c b/cudablas/compute/cuda_zsyr2k.c index 5c40f1d54fa8034218c9ccd9afd6c409e493186b..9bd078b38e7afd1c4494d7f668a8e3d4863361cc 100644 --- a/cudablas/compute/cuda_zsyr2k.c +++ b/cudablas/compute/cuda_zsyr2k.c @@ -20,24 +20,26 @@ */ #include "cudablas.h" -int CUDA_zsyr2k( - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZsyr2k(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsyr2k( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zsyrk.c b/cudablas/compute/cuda_zsyrk.c index b2a61353c747a430b887a661f588f502d713fe79..cdf966546a2a2d7229766146c7000c5c34c47e29 100644 --- a/cudablas/compute/cuda_zsyrk.c +++ b/cudablas/compute/cuda_zsyrk.c @@ -20,21 +20,24 @@ */ #include "cudablas.h" -int CUDA_zsyrk(cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZsyrk(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsyrk( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c index c10a2cb01f817e8e493e140545ed3661877b8be3..58c613a756fccccde789eb01f4b7a9be5f62fe52 100644 --- a/cudablas/compute/cuda_ztpmlqt.c +++ b/cudablas/compute/cuda_ztpmlqt.c @@ -142,7 +142,7 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { int m1, n1; @@ -166,14 +166,14 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } /* TT case */ else if( L == N ) { CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" ); diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c index bfbc286335af56acad55dab892ff763ab0fc7af5..ddcbc1b522e8b6968e6c6ffca000907c6951ae0a 100644 --- a/cudablas/compute/cuda_ztpmqrt.c +++ b/cudablas/compute/cuda_ztpmqrt.c @@ -144,7 +144,7 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { int m1, n1; @@ -168,14 +168,14 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } /* TT case */ else if( L == M ) { CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" ); diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c index a054413e7619e64d215ad4c68c1ef70294394fd5..f177a09312030ef9497570120ab584c6cdae6b05 100644 --- a/cudablas/compute/cuda_ztrmm.c +++ b/cudablas/compute/cuda_ztrmm.c @@ -20,40 +20,27 @@ */ #include "cudablas.h" -int CUDA_ztrmm( - cham_side_t side, cham_uplo_t uplo, - cham_trans_t transa, cham_diag_t diag, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, + cham_trans_t transa, cham_diag_t diag, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - -#if defined(CHAMELEON_USE_CUBLAS_V2) - - cublasZtrmm( - CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - chameleon_cublas_const(transa), chameleon_cublas_const(diag), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - B, ldb); - -#else - - cublasZtrmm( - CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - chameleon_cublas_const(transa), chameleon_cublas_const(diag), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb); -#endif - - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); - + cublasStatus_t rc; + + rc = cublasZtrmm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + chameleon_cublas_const(transa), chameleon_cublas_const(diag), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + B, ldb ); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztrsm.c b/cudablas/compute/cuda_ztrsm.c index 5f63d2464c679c239393040f66bb8f5ef3c1df9b..32d8c83c9d4f80bde4a76ddc93fb724246ef932e 100644 --- a/cudablas/compute/cuda_ztrsm.c +++ b/cudablas/compute/cuda_ztrsm.c @@ -20,22 +20,25 @@ */ #include "cudablas.h" -int CUDA_ztrsm(cham_side_t side, cham_uplo_t uplo, - cham_trans_t transa, cham_diag_t diag, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, + cham_trans_t transa, cham_diag_t diag, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZtrsm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - chameleon_cublas_const(transa), chameleon_cublas_const(diag), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZtrsm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + chameleon_cublas_const(transa), chameleon_cublas_const(diag), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c index 820518d5dcd7110d16038ae0cc8a9327c6c82029..63918f4b50c889448027d22534de2c4b31a0f0cc 100644 --- a/cudablas/compute/cuda_ztsmlq.c +++ b/cudablas/compute/cuda_ztsmlq.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_ztsmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int kb; @@ -133,7 +133,7 @@ int CUDA_ztsmlq( A2, LDA2, V + i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c index e3e576caf34576e5945df609fbc2418c8db4295e..61b05597243418c988b5cea60e914fc35773881f 100644 --- a/cudablas/compute/cuda_ztsmqr.c +++ b/cudablas/compute/cuda_ztsmqr.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_ztsmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int NQ; @@ -135,7 +135,7 @@ int CUDA_ztsmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c index c86f6611f34ab556c3bbbcdd7476937b6de13a34..4e6ac068a494b1afbf0ace0ce03b8595c9aa0fb1 100644 --- a/cudablas/compute/cuda_zttmlq.c +++ b/cudablas/compute/cuda_zttmlq.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_zttmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_zttmlq( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int kb, l; @@ -134,7 +134,7 @@ int CUDA_zttmlq( A2, LDA2, V + i, LDV, T + LDT * i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c index 8b9036d5373213a51f25947a2b6f4cadec68ae3f..bd2a17569b35ca7930a0a262ee3a72d7bdf69ba3 100644 --- a/cudablas/compute/cuda_zttmqr.c +++ b/cudablas/compute/cuda_zttmqr.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_zttmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_zttmqr( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int NQ; @@ -135,7 +135,7 @@ int CUDA_zttmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c index 5689bfa016095632377796c1d1b249f8d0550275..47a3b19806b6fd4dc4acab4db331e3ee734e1652 100644 --- a/cudablas/compute/cuda_zunmlqt.c +++ b/cudablas/compute/cuda_zunmlqt.c @@ -21,13 +21,13 @@ #include "cudablas.h" int -CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, - int M, int N, int K, int IB, - const cuDoubleComplex *A, int LDA, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zunmlqt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int IB, + const cuDoubleComplex *A, int LDA, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + cublasHandle_t handle ) { int i, kb; int i1, i3; @@ -122,7 +122,7 @@ CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, A + LDA * i + i, LDA, T + LDT * i, LDT, C + LDC * jc + ic, LDC, - WORK, LDWORK, CUBLAS_STREAM_VALUE); + WORK, LDWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c index a16d10a4792f0e6a472ccf2f77a53607a4d37a29..3864bd5ffe061a38f03531da9a544c79ee4316cc 100644 --- a/cudablas/compute/cuda_zunmqrt.c +++ b/cudablas/compute/cuda_zunmqrt.c @@ -21,13 +21,13 @@ #include "cudablas.h" int -CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, - int M, int N, int K, int IB, - const cuDoubleComplex *A, int LDA, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zunmqrt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int IB, + const cuDoubleComplex *A, int LDA, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + cublasHandle_t handle ) { int i, kb; int i1, i3; @@ -116,7 +116,7 @@ CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, T + LDT * i, LDT, C + LDC * jc + ic, LDC, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); } return CHAMELEON_SUCCESS; diff --git a/cudablas/eztrace_module/cudablas_eztrace_module b/cudablas/eztrace_module/cudablas_eztrace_module index c631193b1fcedd7597a645dad9d208b29c1faed8..bdf0a7262531351393e7f4d8260e6026fc6949e3 100644 --- a/cudablas/eztrace_module/cudablas_eztrace_module +++ b/cudablas/eztrace_module/cudablas_eztrace_module @@ -21,15 +21,6 @@ int CUDA_cgemerge( void *A, void* LDA, void *B, void* LDB, void* stream); -int CUDA_cgemm_V2( - void* transa, void* transb, - int m, int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_cgemm( void* transa, void* transb, int m, int n, int k, @@ -69,15 +60,6 @@ int CUDA_cgetrf_nopiv( void* m, void* n, void *dA, void* ldda, void* info); -int CUDA_chemm_V2( - void* side, void* uplo, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_chemm( void* side, void* uplo, int m, int n, @@ -87,15 +69,6 @@ int CUDA_chemm( void *beta, void *C, int ldc, void* stream); -int CUDA_cher2k_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - float *beta, - void *C, int ldc, - void* stream); int CUDA_cher2k( void* uplo, void* trans, int n, int k, @@ -105,14 +78,6 @@ int CUDA_cher2k( float *beta, void *C, int ldc, void* stream); -int CUDA_cherk_V2( - void* uplo, void* trans, - int n, int k, - float *alpha, - const void *A, int lda, - float *beta, - void *B, int ldb, - void* stream); int CUDA_cherk( void* uplo, void* trans, int n, int k, @@ -148,15 +113,6 @@ int CUDA_cssssm( void *dL1, void* lddl1, void *dL2, void* lddl2, void* *IPIV, void* info); -int CUDA_csymm_V2( - void* side, void* uplo, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_csymm( void* side, void* uplo, int m, int n, @@ -166,15 +122,6 @@ int CUDA_csymm( void *beta, void *C, int ldc, void* stream); -int CUDA_csyr2k_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_csyr2k( void* uplo, void* trans, int n, int k, @@ -184,14 +131,6 @@ int CUDA_csyr2k( void *beta, void *C, int ldc, void* stream); -int CUDA_csyrk_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - void *beta, - void *C, int ldc, - void* stream); int CUDA_csyrk( void* uplo, void* trans, int n, int k, @@ -200,15 +139,6 @@ int CUDA_csyrk( void *beta, void *C, int ldc, void* stream); -int CUDA_ctrmm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *C, int ldc, - void* stream); int CUDA_ctrmm( void* side, void* uplo, void* transa, void* diag, @@ -217,14 +147,6 @@ int CUDA_ctrmm( const void *A, int lda, void *B, int ldb, void* stream); -int CUDA_ctrsm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - const void *alpha, - const void *A, int lda, - void *B, int ldb, - void* stream); int CUDA_ctrsm( void* side, void* uplo, void* transa, void* diag, @@ -352,15 +274,6 @@ int CUDA_dgemerge( double *A, void* LDA, double *B, void* LDB, void* stream); -int CUDA_dgemm_V2( - void* transa, void* transb, - int m, int n, int k, - double *alpha, - const double *A, int lda, - const double *B, int ldb, - double *beta, - double *C, int ldc, - void* stream); int CUDA_dgemm( void* transa, void* transb, int m, int n, int k, @@ -400,15 +313,6 @@ int CUDA_dgetrf_nopiv( void* m, void* n, double *dA, void* ldda, void* info); -int CUDA_dsymm_V2( - void* side, void* uplo, - int m, int n, - double *alpha, - const double *A, int lda, - const double *B, int ldb, - double *beta, - double *C, int ldc, - void* stream); int CUDA_dsymm( void* side, void* uplo, int m, int n, @@ -418,15 +322,6 @@ int CUDA_dsymm( double *beta, double *C, int ldc, void* stream); -int CUDA_dsyr2k_V2( - void* uplo, void* trans, - int n, int k, - double *alpha, - const double *A, int lda, - const double *B, int ldb, - double *beta, - double *C, int ldc, - void* stream); int CUDA_dsyr2k( void* uplo, void* trans, int n, int k, @@ -436,14 +331,6 @@ int CUDA_dsyr2k( double *beta, double *C, int ldc, void* stream); -int CUDA_dsyrk_V2( - void* uplo, void* trans, - int n, int k, - double *alpha, - const double *A, int lda, - double *beta, - double *B, int ldb, - void* stream); int CUDA_dsyrk( void* uplo, void* trans, int n, int k, @@ -479,15 +366,6 @@ int CUDA_dssssm( double *dL1, void* lddl1, double *dL2, void* lddl2, void* *IPIV, void* info); -int CUDA_dtrmm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - double *alpha, - const double *A, int lda, - const double *B, int ldb, - double *C, int ldc, - void* stream); int CUDA_dtrmm( void* side, void* uplo, void* transa, void* diag, @@ -496,14 +374,6 @@ int CUDA_dtrmm( const double *A, int lda, double *B, int ldb, void* stream); -int CUDA_dtrsm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - const double *alpha, - const double *A, int lda, - double *B, int ldb, - void* stream); int CUDA_dtrsm( void* side, void* uplo, void* transa, void* diag, @@ -631,15 +501,6 @@ int CUDA_sgemerge( float *A, void* LDA, float *B, void* LDB, void* stream); -int CUDA_sgemm_V2( - void* transa, void* transb, - int m, int n, int k, - float *alpha, - const float *A, int lda, - const float *B, int ldb, - float *beta, - float *C, int ldc, - void* stream); int CUDA_sgemm( void* transa, void* transb, int m, int n, int k, @@ -679,15 +540,6 @@ int CUDA_sgetrf_nopiv( void* m, void* n, float *dA, void* ldda, void* info); -int CUDA_ssymm_V2( - void* side, void* uplo, - int m, int n, - float *alpha, - const float *A, int lda, - const float *B, int ldb, - float *beta, - float *C, int ldc, - void* stream); int CUDA_ssymm( void* side, void* uplo, int m, int n, @@ -697,15 +549,6 @@ int CUDA_ssymm( float *beta, float *C, int ldc, void* stream); -int CUDA_ssyr2k_V2( - void* uplo, void* trans, - int n, int k, - float *alpha, - const float *A, int lda, - const float *B, int ldb, - float *beta, - float *C, int ldc, - void* stream); int CUDA_ssyr2k( void* uplo, void* trans, int n, int k, @@ -715,14 +558,6 @@ int CUDA_ssyr2k( float *beta, float *C, int ldc, void* stream); -int CUDA_ssyrk_V2( - void* uplo, void* trans, - int n, int k, - float *alpha, - const float *A, int lda, - float *beta, - float *B, int ldb, - void* stream); int CUDA_ssyrk( void* uplo, void* trans, int n, int k, @@ -758,15 +593,6 @@ int CUDA_sssssm( float *dL1, void* lddl1, float *dL2, void* lddl2, void* *IPIV, void* info); -int CUDA_strmm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - float *alpha, - const float *A, int lda, - const float *B, int ldb, - float *C, int ldc, - void* stream); int CUDA_strmm( void* side, void* uplo, void* transa, void* diag, @@ -775,14 +601,6 @@ int CUDA_strmm( const float *A, int lda, float *B, int ldb, void* stream); -int CUDA_strsm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - const float *alpha, - const float *A, int lda, - float *B, int ldb, - void* stream); int CUDA_strsm( void* side, void* uplo, void* transa, void* diag, @@ -910,15 +728,6 @@ int CUDA_zgemerge( void *A, void* LDA, void *B, void* LDB, void* stream); -int CUDA_zgemm_V2( - void* transa, void* transb, - int m, int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_zgemm( void* transa, void* transb, int m, int n, int k, @@ -958,15 +767,6 @@ int CUDA_zgetrf_nopiv( void* m, void* n, void *dA, void* ldda, void* info); -int CUDA_zhemm_V2( - void* side, void* uplo, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_zhemm( void* side, void* uplo, int m, int n, @@ -976,15 +776,6 @@ int CUDA_zhemm( void *beta, void *C, int ldc, void* stream); -int CUDA_zher2k_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - double *beta, - void *C, int ldc, - void* stream); int CUDA_zher2k( void* uplo, void* trans, int n, int k, @@ -994,14 +785,6 @@ int CUDA_zher2k( double *beta, void *C, int ldc, void* stream); -int CUDA_zherk_V2( - void* uplo, void* trans, - int n, int k, - double *alpha, - const void *A, int lda, - double *beta, - void *B, int ldb, - void* stream); int CUDA_zherk( void* uplo, void* trans, int n, int k, @@ -1037,15 +820,6 @@ int CUDA_zssssm( void *dL1, void* lddl1, void *dL2, void* lddl2, void* *IPIV, void* info); -int CUDA_zsymm_V2( - void* side, void* uplo, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_zsymm( void* side, void* uplo, int m, int n, @@ -1055,15 +829,6 @@ int CUDA_zsymm( void *beta, void *C, int ldc, void* stream); -int CUDA_zsyr2k_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *beta, - void *C, int ldc, - void* stream); int CUDA_zsyr2k( void* uplo, void* trans, int n, int k, @@ -1073,14 +838,6 @@ int CUDA_zsyr2k( void *beta, void *C, int ldc, void* stream); -int CUDA_zsyrk_V2( - void* uplo, void* trans, - int n, int k, - void *alpha, - const void *A, int lda, - void *beta, - void *C, int ldc, - void* stream); int CUDA_zsyrk( void* uplo, void* trans, int n, int k, @@ -1089,15 +846,6 @@ int CUDA_zsyrk( void *beta, void *C, int ldc, void* stream); -int CUDA_ztrmm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - void *alpha, - const void *A, int lda, - const void *B, int ldb, - void *C, int ldc, - void* stream); int CUDA_ztrmm( void* side, void* uplo, void* transa, void* diag, @@ -1106,14 +854,6 @@ int CUDA_ztrmm( const void *A, int lda, void *B, int ldb, void* stream); -int CUDA_ztrsm_V2( - void* side, void* uplo, - void* transa, void* diag, - int m, int n, - const void *alpha, - const void *A, int lda, - void *B, int ldb, - void* stream); int CUDA_ztrsm( void* side, void* uplo, void* transa, void* diag, diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h index ad581009ff75fe4b7be9eec660d069d9d8e36581..066680e640100f1153e228c1db339b8bce726d4a 100644 --- a/cudablas/include/cudablas.h +++ b/cudablas/include/cudablas.h @@ -36,31 +36,11 @@ #include <cuda.h> #include <cuComplex.h> -#if defined(CHAMELEON_USE_CUBLAS_V2) - #include <cublas.h> #include <cublas_v2.h> -#define CUBLAS_STREAM_PARAM cublasHandle_t handle -#define CUBLAS_STREAM_VALUE handle -#define CUBLAS_HANDLE handle, #define CUBLAS_SADDR(_a_) (&(_a_)) #define CUBLAS_VALUE(_a_) (_a_) -#define CUBLAS_GET_STREAM \ - cudaStream_t stream; \ - cublasGetStream( handle, &stream ) - -#else - -#include <cublas.h> -#define CUBLAS_STREAM_PARAM cudaStream_t stream -#define CUBLAS_STREAM_VALUE stream -#define CUBLAS_HANDLE -#define CUBLAS_SADDR(_a_) (_a_) -#define CUBLAS_VALUE(_a_) (*(_a_)) -#define CUBLAS_GET_STREAM - -#endif /* defined(CHAMELEON_USE_CUBLAS_V2) */ /** * CHAMELEON types and constants @@ -95,12 +75,7 @@ extern char *chameleon_lapack_constants[]; #define chameleon_lapack_const(chameleon_const) chameleon_lapack_constants[chameleon_const][0] extern int chameleon_cublas_constants[]; - -#if defined(CHAMELEON_USE_CUBLAS_V2) #define chameleon_cublas_const(chameleon_const) chameleon_cublas_constants[chameleon_const] -#else -#define chameleon_cublas_const(chameleon_const) chameleon_lapack_constants[chameleon_const][0] -#endif END_C_DECLS diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h index 2d22e33ca04a2215f76b477ca2a90c4b6c4cb96f..9eeebdf174e8adafbadc21b16acce1405d271fc8 100644 --- a/cudablas/include/cudablas/cudablas_z.h +++ b/cudablas/include/cudablas/cudablas_z.h @@ -24,27 +24,27 @@ /** * Declarations of cuda kernels - alphabetical order */ -int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, CUBLAS_STREAM_PARAM ); -int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_zhemm( cham_side_t side, cham_uplo_t uplo, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const double *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM ); -int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const double *alpha, const cuDoubleComplex *A, int lda, const double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM ); -int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); -int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM ); -int CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); -int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); -int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM ); +int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); +int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_zhemm( cham_side_t side, cham_uplo_t uplo, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const double *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, cublasHandle_t handle ); +int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const double *alpha, const cuDoubleComplex *A, int lda, const double *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, cublasHandle_t handle ); +int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ); +int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); +int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, cublasHandle_t handle ); +int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, cublasHandle_t handle ); +int CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ); +int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ); +int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ); +int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, cublasHandle_t handle ); +int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, cublasHandle_t handle ); +int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A, int LDA, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, cublasHandle_t handle ); #endif /* _cudablas_z_h_ */ diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in index 2b7e3e4bc644f17cf7980463cfda09363a9ddc54..9f62ff9852df02bca2127f193d7cdef28bca111a 100644 --- a/include/chameleon/config.h.in +++ b/include/chameleon/config.h.in @@ -51,7 +51,6 @@ /* GPU Support */ #cmakedefine CHAMELEON_USE_CUDA #cmakedefine CHAMELEON_USE_CUBLAS -#cmakedefine CHAMELEON_USE_CUBLAS_V2 /* Hmat-oss */ #cmakedefine CHAMELEON_USE_HMAT diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index 7d7a0045a0d73d765ad9c70621143ad3086f6685..9cfe15c08f9ab929ac9e22cc83648795d0dfffe0 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -44,9 +44,10 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) return; } -#ifdef CHAMELEON_USE_CUBLAS_V2 +#ifdef CHAMELEON_USE_CUBLAS static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_trans_t trans; int M; int N; @@ -59,28 +60,20 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta ); - RUNTIME_getStream( stream ); - - CUDA_zgeadd( - trans, - M, N, - &alpha, tileA->mat, tileA->ld, - &beta, tileB->mat, tileB->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + CUDA_zgeadd( trans, M, N, + &alpha, tileA->mat, tileA->ld, + &beta, tileB->mat, tileB->ld, + handle ); return; } -#endif /* defined(CHAMELEON_USE_CUBLAS_V2) */ +#endif /* defined(CHAMELEON_USE_CUBLAS) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUBLAS_V2) +#if defined(CHAMELEON_USE_CUBLAS) CODELETS(zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) #else CODELETS_CPU(zgeadd, cl_zgeadd_cpu_func) diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index a3f584f55b350803cbcadb0648c3bed16930c0bd..38d6337bb5e9d9c4d4d956e3d169f938b8de077a 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -64,6 +64,7 @@ static void cl_zgemm_cuda_func( void *descr[], void *cl_arg ) { struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -72,8 +73,6 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - RUNTIME_getStream( stream ); - assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); assert( tileC->format & CHAMELEON_TILE_FULLRANK ); @@ -86,11 +85,7 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) tileB->mat, tileB->ld, (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + handle ); return; } diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c index 89fdd39f2070967f7dae22ffa0a4491e38de4534..3aedbfa4149a7dc144483f4ca91142b14b5554f6 100644 --- a/runtime/starpu/codelets/codelet_zgemv.c +++ b/runtime/starpu/codelets/codelet_zgemv.c @@ -47,6 +47,7 @@ static void cl_zgemv_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) & 0 static void cl_zgemv_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_trans_t transA; cham_trans_t transB; int m; @@ -64,20 +65,13 @@ static void cl_zgemv_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); - RUNTIME_getStream( stream ); - CUDA_zgemv( transA, transB, m, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - + handle ); return; } #endif /* defined(CHAMELEON_USE_CUDA) */ diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 2b85fbdaeed51374262376fd224150ad6ea3f980..20d0e352b33b1c4a74682622b17d136bbdd8fc70 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -54,6 +54,7 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_uplo_t uplo; int M; @@ -70,21 +71,13 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zhemm( side, uplo, M, N, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index a3d12b1582ad9fb1abbe912eda14ef62a846e300..1f9585e58edb3385f8451d87d7585ff049798669 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -51,6 +51,7 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; cham_trans_t trans; int n; @@ -67,20 +68,12 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zher2k( uplo, trans, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c index 4cc999337187e52c95dcae880305305f4ef9f685..af9b2a256eda483f66e5e2504ff581767fd1e3f6 100644 --- a/runtime/starpu/codelets/codelet_zherfb.c +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -47,6 +47,7 @@ static void cl_zherfb_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; int n, k, ib, nb; CHAM_tile_t *tileA; @@ -55,8 +56,6 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) CHAM_tile_t *tileW; int ldW; - RUNTIME_getStream(stream); - tileA = cti_interface_get(descr[0]); tileT = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); @@ -68,11 +67,7 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 008d1822dafe9d56c0961993d56d4fea9c6c0645..d877cda36e57b4f7b7cfd01af167448f14ad4056 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -57,6 +57,7 @@ cl_zherk_cpu_func(void *descr[], void *cl_arg) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; @@ -64,21 +65,13 @@ cl_zherk_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_zherk( clargs->uplo, clargs->trans, clargs->n, clargs->k, &(clargs->alpha), tileA->mat, tileA->ld, &(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index e03e56be3c5a478db1293572ea5831c9da96414f..8026b32fe44ead259d0c602249b289e23eb49029 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -54,6 +54,7 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_uplo_t uplo; int M; @@ -70,21 +71,13 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zsymm( side, uplo, M, N, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 5a5c5478751575d320f9612d00920510f0de20db..c020938bf091a75347f871162dd50c9f0521986b 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -51,6 +51,7 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; cham_trans_t trans; int n; @@ -67,20 +68,12 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zsyr2k( uplo, trans, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index ee6f6aa538eafae32c2b1e6f57b7dc88dd4402db..1d09960c91ead88703d24e01f3d9701d5d7ab0ba 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -57,6 +57,7 @@ cl_zsyrk_cpu_func(void *descr[], void *cl_arg) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; @@ -64,21 +65,13 @@ cl_zsyrk_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_zsyrk( clargs->uplo, clargs->trans, clargs->n, clargs->k, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 076144bc9749dd74d1adde6e19cb7f546de5dcb7..ce1e70b15fdc1c8161460d389c0373d773025e1b 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -51,6 +51,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int M; @@ -73,19 +74,13 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - RUNTIME_getStream(stream); - CUDA_ztpmlqt( side, trans, M, N, K, L, ib, tileV->mat, tileV->ld, tileT->mat, tileT->ld, tileA->mat, tileA->ld, tileB->mat, tileB->ld, - tileW->mat, lwork, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, lwork, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 66461e6c45c5773c4bf4419389fb3423a53b6112..a8870a36f72df20e459142a9821ebfad3d71a2c4 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -51,6 +51,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int M; @@ -73,19 +74,13 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - RUNTIME_getStream(stream); - CUDA_ztpmqrt( side, trans, M, N, K, L, ib, tileV->mat, tileV->ld, tileT->mat, tileT->ld, tileA->mat, tileA->ld, tileB->mat, tileB->ld, - tileW->mat, lwork, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, lwork, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 1c6a2901c97437f4699d9a1045b2d0d0f0af64d4..0a34ea60ac0c6646b57b50da8d7af071b23712a6 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -57,6 +57,7 @@ cl_ztrmm_cpu_func(void *descr[], void *cl_arg) static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; @@ -64,21 +65,13 @@ cl_ztrmm_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_ztrmm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, clargs->m, clargs->n, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 423e95fc943f02178d184285cbbb0acc5a8dc5ac..debe56add9e9d872fb34a3b8645cae7e18127d4f 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -59,27 +59,20 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_ztrsm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, clargs->m, clargs->n, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index e0d55765aaa4479c7657426892c1b148f9d4ecb8..19efea82b7e55d3326d9bdc05897f0e58f4a2033 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -59,6 +59,7 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int m; @@ -78,18 +79,12 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - RUNTIME_getStream(stream); - CUDA_zunmlqt( side, trans, m, n, k, ib, tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 58cad3e8aec870a730452afea146d60a3d78c160..31489663daed951ce33761abe1f3c391b29d3a33 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -58,6 +58,7 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int m; @@ -77,18 +78,12 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - RUNTIME_getStream(stream); - CUDA_zunmqrt( side, trans, m, n, k, ib, tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 3156108174b12ea2f34637d24a2bc7b66458a192..15391b62d907de4a3be828049f381893717249b1 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -63,11 +63,9 @@ #include <cublas.h> #include <starpu_cublas.h> -#if defined(CHAMELEON_USE_CUBLAS_V2) #include <cublas_v2.h> #include <starpu_cublas_v2.h> #endif -#endif #if defined(CHAMELEON_SIMULATION) # if !defined(STARPU_SIMGRID) @@ -127,19 +125,6 @@ typedef struct starpu_option_request_s { #endif -/* - * cuBlasAPI v2 - StarPU enable the support for cublas handle - */ -#if defined(CHAMELEON_USE_CUDA) && defined(CHAMELEON_USE_CUBLAS_V2) -#define RUNTIME_getStream(_stream_) \ - cublasHandle_t _stream_ = starpu_cublas_get_local_handle(); -#else -#define RUNTIME_getStream(_stream_) \ - cudaStream_t _stream_ = starpu_cuda_get_local_stream(); \ - cublasSetKernelStream( stream ); - -#endif - /* * Enable codelets names */ diff --git a/runtime/starpu/interface/cham_tile_interface.c b/runtime/starpu/interface/cham_tile_interface.c index a237d2cfe888e87a3ddbcf88af3dca344afdfc1c..d9f460b64dfb0e7cfc6b3db2f46d533066e6d094 100644 --- a/runtime/starpu/interface/cham_tile_interface.c +++ b/runtime/starpu/interface/cham_tile_interface.c @@ -119,7 +119,7 @@ cti_register_data_handle( starpu_data_handle_t handle, void *data_interface ) { starpu_cham_tile_interface_t *cham_tile_interface = (starpu_cham_tile_interface_t *) data_interface; - unsigned node; + int node; for (node = 0; node < STARPU_MAXNODES; node++) { @@ -573,10 +573,10 @@ static int cti_copy_any_to_any( void *src_interface, unsigned src_node, starpu_cham_tile_interface_t *cham_tile_src = (starpu_cham_tile_interface_t *) src_interface; starpu_cham_tile_interface_t *cham_tile_dst = (starpu_cham_tile_interface_t *) dst_interface; size_t elemsize = CHAMELEON_Element_Size( cham_tile_src->flttype ); - size_t m = cham_tile_src->tile.m; - size_t n = cham_tile_src->tile.n; - size_t ld_src = cham_tile_src->tile.ld; - size_t ld_dst = cham_tile_dst->tile.ld; + size_t m = (size_t)(cham_tile_src->tile.m); + size_t n = (size_t)(cham_tile_src->tile.n); + size_t ld_src = (size_t)(cham_tile_src->tile.ld); + size_t ld_dst = (size_t)(cham_tile_dst->tile.ld); int ret = 0; void *src_mat = CHAM_tile_get_ptr( &(cham_tile_src->tile) ); @@ -585,8 +585,8 @@ static int cti_copy_any_to_any( void *src_interface, unsigned src_node, assert( ld_src >= m ); assert( ld_dst >= m ); - assert( m == cham_tile_dst->tile.m ); - assert( n == cham_tile_dst->tile.n ); + assert( m == (size_t)(cham_tile_dst->tile.m) ); + assert( n == (size_t)(cham_tile_dst->tile.n) ); #if defined(CHAMELEON_KERNELS_TRACE) fprintf( stderr,