From 56bcbe05cae2ce89d395fdbdface11eabf35deec Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 16 Feb 2022 10:48:45 +0100 Subject: [PATCH] cudablas: remove the usage of macros in kernels and assert that the return value is a success --- cudablas/compute/cuda_zgeadd.c | 30 +++--- cudablas/compute/cuda_zgemerge.c | 18 ++-- cudablas/compute/cuda_zgemm.c | 34 ++++--- cudablas/compute/cuda_zgeqrt.c | 5 +- cudablas/compute/cuda_zhemm.c | 34 ++++--- cudablas/compute/cuda_zher2k.c | 33 +++--- cudablas/compute/cuda_zherfb.c | 10 +- cudablas/compute/cuda_zherk.c | 29 +++--- cudablas/compute/cuda_zlarfb.c | 63 ++++++------ cudablas/compute/cuda_zparfb.c | 169 ++++++++++++++++--------------- cudablas/compute/cuda_zsymm.c | 33 +++--- cudablas/compute/cuda_zsyr2k.c | 34 ++++--- cudablas/compute/cuda_zsyrk.c | 29 +++--- cudablas/compute/cuda_ztpmlqt.c | 6 +- cudablas/compute/cuda_ztpmqrt.c | 6 +- cudablas/compute/cuda_ztrmm.c | 36 +++---- cudablas/compute/cuda_ztrsm.c | 31 +++--- cudablas/compute/cuda_ztsmlq.c | 24 ++--- cudablas/compute/cuda_ztsmqr.c | 24 ++--- cudablas/compute/cuda_zttmlq.c | 24 ++--- cudablas/compute/cuda_zttmqr.c | 24 ++--- cudablas/compute/cuda_zunmlqt.c | 16 +-- cudablas/compute/cuda_zunmqrt.c | 16 +-- 23 files changed, 385 insertions(+), 343 deletions(-) diff --git a/cudablas/compute/cuda_zgeadd.c b/cudablas/compute/cuda_zgeadd.c index b77093207..78ebaf077 100644 --- a/cudablas/compute/cuda_zgeadd.c +++ b/cudablas/compute/cuda_zgeadd.c @@ -72,22 +72,24 @@ * @retval <0 if -i, the i-th argument had an illegal value * */ -int CUDA_zgeadd(cham_trans_t trans, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int CUDA_zgeadd( cham_trans_t trans, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZgeam(CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), - m, n, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), B, ldb, - B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZgeam( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), + m, n, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb, + B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c index 9f3f8d134..de144f790 100644 --- a/cudablas/compute/cuda_zgemerge.c +++ b/cudablas/compute/cuda_zgemerge.c @@ -25,11 +25,12 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, - CUBLAS_STREAM_PARAM) + cublasHandle_t handle ) { - int i; const cuDoubleComplex *cola; cuDoubleComplex *colb; + cublasStatus_t rc; + int i; if (M < 0) { return -1; @@ -44,26 +45,23 @@ CUDA_zgemerge( cham_side_t side, cham_diag_t diag, return -7; } - CUBLAS_GET_STREAM; - if (side == ChamLeft){ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cudaMemcpyAsync(colb , cola, - (i+1)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); + rc = cublasZcopy( handle, i+1, cola, 1, colb, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } }else{ for(i=0; i<N; i++){ cola = A + i*LDA; colb = B + i*LDB; - cudaMemcpyAsync(colb+i , cola+i, - (M-i)*sizeof(cuDoubleComplex), - cudaMemcpyDeviceToDevice, stream); + rc = cublasZcopy( handle, M-i, cola, 1, colb, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } (void)diag; + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgemm.c b/cudablas/compute/cuda_zgemm.c index 7a5900b76..7a9da427a 100644 --- a/cudablas/compute/cuda_zgemm.c +++ b/cudablas/compute/cuda_zgemm.c @@ -20,22 +20,26 @@ */ #include "cudablas.h" -int CUDA_zgemm(cham_trans_t transa, cham_trans_t transb, - int m, int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(transa), chameleon_cublas_const(transb), - m, n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transa), chameleon_cublas_const(transb), + m, n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zgeqrt.c b/cudablas/compute/cuda_zgeqrt.c index 32abd8f90..d2d33026f 100644 --- a/cudablas/compute/cuda_zgeqrt.c +++ b/cudablas/compute/cuda_zgeqrt.c @@ -21,7 +21,8 @@ #include "cudablas.h" #if defined(CHAMELEON_USE_MAGMA) -int CUDA_zgeqrt( +int +CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComplex *da, magma_int_t ldda, magmaDoubleComplex *v, magma_int_t ldv, @@ -32,7 +33,7 @@ int CUDA_zgeqrt( magmaDoubleComplex *tau, magmaDoubleComplex *hwork, magmaDoubleComplex *dwork, - CUstream stream) + CUstream stream ) { #define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1)) #define v_ref(a_1,a_2) ( v+(a_2)*(ldv) + (a_1)) diff --git a/cudablas/compute/cuda_zhemm.c b/cudablas/compute/cuda_zhemm.c index 26a470b34..95201c035 100644 --- a/cudablas/compute/cuda_zhemm.c +++ b/cudablas/compute/cuda_zhemm.c @@ -20,22 +20,26 @@ */ #include "cudablas.h" -int CUDA_zhemm(cham_side_t side, cham_uplo_t uplo, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zhemm( cham_side_t side, cham_uplo_t uplo, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZhemm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZhemm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zher2k.c b/cudablas/compute/cuda_zher2k.c index 0fd08570d..bc4e69c83 100644 --- a/cudablas/compute/cuda_zher2k.c +++ b/cudablas/compute/cuda_zher2k.c @@ -20,23 +20,26 @@ */ #include "cudablas.h" -int CUDA_zher2k(cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const double *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const double *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZher2k(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZher2k( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zherfb.c b/cudablas/compute/cuda_zherfb.c index 13a0d9c24..36e1784fb 100644 --- a/cudablas/compute/cuda_zherfb.c +++ b/cudablas/compute/cuda_zherfb.c @@ -27,7 +27,7 @@ CUDA_zherfb( cham_uplo_t uplo, int n, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { /* Check input arguments */ if ((uplo != ChamUpper) && (uplo != ChamLower)) { @@ -67,21 +67,21 @@ CUDA_zherfb( cham_uplo_t uplo, int n, /* Left */ CUDA_zunmqrt( ChamLeft, ChamConjTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); /* Right */ CUDA_zunmqrt( ChamRight, ChamNoTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { /* Right */ CUDA_zunmlqt( ChamRight, ChamConjTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); /* Left */ CUDA_zunmlqt( ChamLeft, ChamNoTrans, n, n, k, ib, A, lda, T, ldt, C, ldc, WORK, ldwork, - CUBLAS_STREAM_VALUE ); + handle ); } return 0; } diff --git a/cudablas/compute/cuda_zherk.c b/cudablas/compute/cuda_zherk.c index cd0d7ff64..3ac3b66f1 100644 --- a/cudablas/compute/cuda_zherk.c +++ b/cudablas/compute/cuda_zherk.c @@ -20,21 +20,24 @@ */ #include "cudablas.h" -int CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const double *alpha, - const cuDoubleComplex *A, int lda, - const double *beta, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_zherk( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const double *alpha, + const cuDoubleComplex *A, int lda, + const double *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZherk( CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZherk( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index 3cc573ff5..49a9baaa3 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -30,7 +30,7 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -42,8 +42,9 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, double mzone = -1.0; #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ - cham_trans_t transT, notransV, transV; - cham_uplo_t uplo; + cublasStatus_t rc; + cham_trans_t transT, notransV, transV; + cham_uplo_t uplo; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -103,54 +104,58 @@ CUDA_zlarfb( cham_side_t side, cham_trans_t trans, // Comments assume H C. When forming H^H C, T gets transposed via transT. // W = C^H V - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), - N, K, M, - CUBLAS_SADDR(zone), C, LDC, - V, LDV, - CUBLAS_SADDR(zzero), WORK, LDWORK ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamConjTrans), chameleon_cublas_const(notransV), + N, K, M, + CUBLAS_SADDR(zone), C, LDC, + V, LDV, + CUBLAS_SADDR(zzero), WORK, LDWORK ); + assert( rc == CUBLAS_STATUS_SUCCESS ); // W = W T^H = C^H V T^H CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit, N, K, &zone, T, LDT, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), - M, N, K, - CUBLAS_SADDR(mzone), V, LDV, - WORK, LDWORK, - CUBLAS_SADDR(zone), C, LDC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(notransV), chameleon_cublas_const(ChamConjTrans), + M, N, K, + CUBLAS_SADDR(mzone), V, LDV, + WORK, LDWORK, + CUBLAS_SADDR(zone), C, LDC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { // Form C H or C H^H // Comments assume C H. When forming C H^H, T gets transposed via trans. // W = C V - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV), - M, K, N, - CUBLAS_SADDR(zone), C, LDC, - V, LDV, - CUBLAS_SADDR(zzero), WORK, LDWORK ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(notransV), + M, K, N, + CUBLAS_SADDR(zone), C, LDC, + V, LDV, + CUBLAS_SADDR(zzero), WORK, LDWORK ); + assert( rc == CUBLAS_STATUS_SUCCESS ); // W = W T = C V T CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit, M, K, &zone, T, LDT, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV), - M, N, K, - CUBLAS_SADDR(mzone), WORK, LDWORK, - V, LDV, - CUBLAS_SADDR(zone), C, LDC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transV), + M, N, K, + CUBLAS_SADDR(mzone), WORK, LDWORK, + V, LDV, + CUBLAS_SADDR(zone), C, LDC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index 419e2b3e5..476c8b7fa 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -146,7 +146,7 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { #if defined(PRECISION_z) || defined(PRECISION_c) cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0); @@ -159,14 +159,12 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, #endif /* defined(PRECISION_z) || defined(PRECISION_c) */ cuDoubleComplex *workW, *workC, *workV; - int ldW, ldC, ldV; - int j; - cham_trans_t transW; - cham_trans_t transA2; - int wssize = 0; - int wrsize = 0; - - CUBLAS_GET_STREAM; + cublasStatus_t rc; + cudaStream_t stream; + int j, ldW, ldC, ldV; + cham_trans_t transW, transA2; + int wssize = 0; + int wrsize = 0; /* Check input arguments */ if ((side != ChamLeft) && (side != ChamRight)) { @@ -219,6 +217,8 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, return CHAMELEON_SUCCESS; } + cublasGetStream( handle, &stream ); + if (direct == ChamDirForward) { if (side == ChamLeft) { @@ -307,12 +307,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, transW = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), - K, N1, M2, - CUBLAS_SADDR(zone), workV /* M2*K */, ldV, - A2 /* M2*N2 */, LDA2, - CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans), + K, N1, M2, + CUBLAS_SADDR(zone), workV /* M2*K */, ldV, + A2 /* M2*N2 */, LDA2, + CUBLAS_SADDR(zone), workW /* K *N2 */, ldW ); + assert( rc == CUBLAS_STATUS_SUCCESS ); if ( workC == NULL ) { /* W = op(T) * W */ @@ -320,48 +321,52 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, K, N2, &zone, T, LDT, workW, ldW, - CUBLAS_STREAM_VALUE ); + handle ); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { - cublasZaxpy( CUBLAS_HANDLE - K, CUBLAS_SADDR(mzone), - workW + ldW * j, 1, - A1 + LDA1 * j, 1 ); + rc = cublasZaxpy( handle, + K, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } /* A2 = A2 - op(V) * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, - workW /* K * N2 */, ldW, - CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workV /* M2 * K */, ldV, + workW /* K * N2 */, ldW, + CUBLAS_SADDR(zone), A2 /* M2 * N2 */, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { /* Wc = V * op(T) */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(transA2), chameleon_cublas_const(trans), - M2, K, K, - CUBLAS_SADDR(zone), workV, ldV, - T, LDT, - CUBLAS_SADDR(zzero), workC, ldC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(transA2), chameleon_cublas_const(trans), + M2, K, K, + CUBLAS_SADDR(zone), workV, ldV, + T, LDT, + CUBLAS_SADDR(zzero), workC, ldC ); /* A1 = A1 - opt(T) * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), - K, N1, K, - CUBLAS_SADDR(mzone), T, LDT, - workW, ldW, - CUBLAS_SADDR(zone), A1, LDA1 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans), + K, N1, K, + CUBLAS_SADDR(mzone), T, LDT, + workW, ldW, + CUBLAS_SADDR(zone), A1, LDA1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* A2 = A2 - Wc * W */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workC, ldC, - workW, ldW, - CUBLAS_SADDR(zone), A2, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workC, ldC, + workW, ldW, + CUBLAS_SADDR(zone), A2, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } else { @@ -450,12 +455,13 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, transW = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans; transA2 = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans; - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), - M1, K, N2, - CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, - workV /* K *N2 */, ldV, - CUBLAS_SADDR(zone), workW /* M1*K */, ldW); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW), + M1, K, N2, + CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), workW /* M1*K */, ldW); + assert( rc == CUBLAS_STATUS_SUCCESS ); if ( workC == NULL ) { /* W = W * op(T) */ @@ -463,48 +469,53 @@ CUDA_zparfb( cham_side_t side, cham_trans_t trans, M2, K, &zone, T, LDT, workW, ldW, - CUBLAS_STREAM_VALUE ); + handle ); /* A1 = A1 - W = A1 - W * op(T) */ for(j = 0; j < K; j++) { - cublasZaxpy( CUBLAS_HANDLE - M1, CUBLAS_SADDR(mzone), - workW + ldW * j, 1, - A1 + LDA1 * j, 1 ); + rc = cublasZaxpy( handle, + M1, CUBLAS_SADDR(mzone), + workW + ldW * j, 1, + A1 + LDA1 * j, 1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } /* A2 = A2 - W * op(V) */ - cublasZgemm(CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), - M2, N2, K, - CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, - workV /* K *N2 */, ldV, - CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2), + M2, N2, K, + CUBLAS_SADDR(mzone), workW /* M2*K */, ldW, + workV /* K *N2 */, ldV, + CUBLAS_SADDR(zone), A2 /* M2*N2 */, LDA2); + assert( rc == CUBLAS_STATUS_SUCCESS ); } else { /* A1 = A1 - W * opt(T) */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), - M1, K, K, - CUBLAS_SADDR(mzone), workW, ldW, - T, LDT, - CUBLAS_SADDR(zone), A1, LDA1 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans), + M1, K, K, + CUBLAS_SADDR(mzone), workW, ldW, + T, LDT, + CUBLAS_SADDR(zone), A1, LDA1 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* Wc = op(T) * V */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(trans), chameleon_cublas_const(transA2), - K, N2, K, - CUBLAS_SADDR(zone), T, LDT, - workV, ldV, - CUBLAS_SADDR(zzero), workC, ldC ); + rc = cublasZgemm( handle, + chameleon_cublas_const(trans), chameleon_cublas_const(transA2), + K, N2, K, + CUBLAS_SADDR(zone), T, LDT, + workV, ldV, + CUBLAS_SADDR(zzero), workC, ldC ); + assert( rc == CUBLAS_STATUS_SUCCESS ); /* A2 = A2 - W * Wc */ - cublasZgemm( CUBLAS_HANDLE - chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), - M2, N2, K, - CUBLAS_SADDR(mzone), workW, ldW, - workC, ldC, - CUBLAS_SADDR(zone), A2, LDA2 ); + rc = cublasZgemm( handle, + chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans), + M2, N2, K, + CUBLAS_SADDR(mzone), workW, ldW, + workC, ldC, + CUBLAS_SADDR(zone), A2, LDA2 ); + assert( rc == CUBLAS_STATUS_SUCCESS ); } } } diff --git a/cudablas/compute/cuda_zsymm.c b/cudablas/compute/cuda_zsymm.c index 2ed61af73..3dcfa14eb 100644 --- a/cudablas/compute/cuda_zsymm.c +++ b/cudablas/compute/cuda_zsymm.c @@ -20,23 +20,26 @@ */ #include "cudablas.h" -int CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM ) +int +CUDA_zsymm( cham_side_t side, cham_uplo_t uplo, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZsymm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsymm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zsyr2k.c b/cudablas/compute/cuda_zsyr2k.c index 5c40f1d54..9bd078b38 100644 --- a/cudablas/compute/cuda_zsyr2k.c +++ b/cudablas/compute/cuda_zsyr2k.c @@ -20,24 +20,26 @@ */ #include "cudablas.h" -int CUDA_zsyr2k( - cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *B, int ldb, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *B, int ldb, + const cuDoubleComplex *beta, + cuDoubleComplex *C, int ldc, + cublasHandle_t handle ) { - cublasZsyr2k(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsyr2k( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zsyrk.c b/cudablas/compute/cuda_zsyrk.c index b2a61353c..cdf966546 100644 --- a/cudablas/compute/cuda_zsyrk.c +++ b/cudablas/compute/cuda_zsyrk.c @@ -20,21 +20,24 @@ */ #include "cudablas.h" -int CUDA_zsyrk(cham_uplo_t uplo, cham_trans_t trans, - int n, int k, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - const cuDoubleComplex *beta, - cuDoubleComplex *C, int ldc, - CUBLAS_STREAM_PARAM) +int +CUDA_zsyrk( cham_uplo_t uplo, cham_trans_t trans, + int n, int k, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + const cuDoubleComplex *beta, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZsyrk(CUBLAS_HANDLE - chameleon_cublas_const(uplo), chameleon_cublas_const(trans), - n, k, - CUBLAS_VALUE(alpha), A, lda, - CUBLAS_VALUE(beta), C, ldc); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZsyrk( handle, + chameleon_cublas_const(uplo), chameleon_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c index c10a2cb01..58c613a75 100644 --- a/cudablas/compute/cuda_ztpmlqt.c +++ b/cudablas/compute/cuda_ztpmlqt.c @@ -142,7 +142,7 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { int m1, n1; @@ -166,14 +166,14 @@ CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } /* TT case */ else if( L == N ) { CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" ); diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c index bfbc28633..ddcbc1b52 100644 --- a/cudablas/compute/cuda_ztpmqrt.c +++ b/cudablas/compute/cuda_ztpmqrt.c @@ -144,7 +144,7 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, - CUBLAS_STREAM_PARAM ) + cublasHandle_t handle ) { int m1, n1; @@ -168,14 +168,14 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } /* TT case */ else if( L == M ) { CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, A, LDA, B, LDB, V, LDV, T, LDT, WORK, lwork, - CUBLAS_STREAM_VALUE ); + handle ); } else { cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" ); diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c index 67e730097..f177a0931 100644 --- a/cudablas/compute/cuda_ztrmm.c +++ b/cudablas/compute/cuda_ztrmm.c @@ -20,27 +20,27 @@ */ #include "cudablas.h" -int CUDA_ztrmm( - cham_side_t side, cham_uplo_t uplo, - cham_trans_t transa, cham_diag_t diag, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_ztrmm( cham_side_t side, cham_uplo_t uplo, + cham_trans_t transa, cham_diag_t diag, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { + cublasStatus_t rc; - cublasZtrmm( - CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - chameleon_cublas_const(transa), chameleon_cublas_const(diag), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb, - B, ldb); - - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZtrmm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + chameleon_cublas_const(transa), chameleon_cublas_const(diag), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztrsm.c b/cudablas/compute/cuda_ztrsm.c index 5f63d2464..32d8c83c9 100644 --- a/cudablas/compute/cuda_ztrsm.c +++ b/cudablas/compute/cuda_ztrsm.c @@ -20,22 +20,25 @@ */ #include "cudablas.h" -int CUDA_ztrsm(cham_side_t side, cham_uplo_t uplo, - cham_trans_t transa, cham_diag_t diag, - int m, int n, - const cuDoubleComplex *alpha, - const cuDoubleComplex *A, int lda, - cuDoubleComplex *B, int ldb, - CUBLAS_STREAM_PARAM) +int +CUDA_ztrsm( cham_side_t side, cham_uplo_t uplo, + cham_trans_t transa, cham_diag_t diag, + int m, int n, + const cuDoubleComplex *alpha, + const cuDoubleComplex *A, int lda, + cuDoubleComplex *B, int ldb, + cublasHandle_t handle ) { - cublasZtrsm(CUBLAS_HANDLE - chameleon_cublas_const(side), chameleon_cublas_const(uplo), - chameleon_cublas_const(transa), chameleon_cublas_const(diag), - m, n, - CUBLAS_VALUE(alpha), A, lda, - B, ldb); + cublasStatus_t rc; - assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); + rc = cublasZtrsm( handle, + chameleon_cublas_const(side), chameleon_cublas_const(uplo), + chameleon_cublas_const(transa), chameleon_cublas_const(diag), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c index 820518d5d..63918f4b5 100644 --- a/cudablas/compute/cuda_ztsmlq.c +++ b/cudablas/compute/cuda_ztsmlq.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_ztsmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int kb; @@ -133,7 +133,7 @@ int CUDA_ztsmlq( A2, LDA2, V + i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c index e3e576caf..61b055972 100644 --- a/cudablas/compute/cuda_ztsmqr.c +++ b/cudablas/compute/cuda_ztsmqr.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_ztsmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int NQ; @@ -135,7 +135,7 @@ int CUDA_ztsmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c index c86f6611f..4e6ac068a 100644 --- a/cudablas/compute/cuda_zttmlq.c +++ b/cudablas/compute/cuda_zttmlq.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_zttmlq( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_zttmlq( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int kb, l; @@ -134,7 +134,7 @@ int CUDA_zttmlq( A2, LDA2, V + i, LDV, T + LDT * i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c index 8b9036d53..bd2a17569 100644 --- a/cudablas/compute/cuda_zttmqr.c +++ b/cudablas/compute/cuda_zttmqr.c @@ -20,17 +20,17 @@ */ #include "cudablas.h" -int CUDA_zttmqr( - cham_side_t side, cham_trans_t trans, - int M1, int N1, - int M2, int N2, - int K, int IB, - cuDoubleComplex *A1, int LDA1, - cuDoubleComplex *A2, int LDA2, - const cuDoubleComplex *V, int LDV, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *WORK, int LWORK, - CUBLAS_STREAM_PARAM) +int +CUDA_zttmqr( cham_side_t side, cham_trans_t trans, + int M1, int N1, + int M2, int N2, + int K, int IB, + cuDoubleComplex *A1, int LDA1, + cuDoubleComplex *A2, int LDA2, + const cuDoubleComplex *V, int LDV, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *WORK, int LWORK, + cublasHandle_t handle ) { int i, i1, i3; int NQ; @@ -135,7 +135,7 @@ int CUDA_zttmqr( A2, LDA2, V + LDV*i, LDV, T + LDT*i, LDT, - WORK, LWORK, CUBLAS_STREAM_VALUE ); + WORK, LWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c index 5689bfa01..47a3b1980 100644 --- a/cudablas/compute/cuda_zunmlqt.c +++ b/cudablas/compute/cuda_zunmlqt.c @@ -21,13 +21,13 @@ #include "cudablas.h" int -CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, - int M, int N, int K, int IB, - const cuDoubleComplex *A, int LDA, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zunmlqt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int IB, + const cuDoubleComplex *A, int LDA, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + cublasHandle_t handle ) { int i, kb; int i1, i3; @@ -122,7 +122,7 @@ CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, A + LDA * i + i, LDA, T + LDT * i, LDT, C + LDC * jc + ic, LDC, - WORK, LDWORK, CUBLAS_STREAM_VALUE); + WORK, LDWORK, handle ); } return CHAMELEON_SUCCESS; } diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c index a16d10a47..3864bd5ff 100644 --- a/cudablas/compute/cuda_zunmqrt.c +++ b/cudablas/compute/cuda_zunmqrt.c @@ -21,13 +21,13 @@ #include "cudablas.h" int -CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, - int M, int N, int K, int IB, - const cuDoubleComplex *A, int LDA, - const cuDoubleComplex *T, int LDT, - cuDoubleComplex *C, int LDC, - cuDoubleComplex *WORK, int LDWORK, - CUBLAS_STREAM_PARAM ) +CUDA_zunmqrt( cham_side_t side, cham_trans_t trans, + int M, int N, int K, int IB, + const cuDoubleComplex *A, int LDA, + const cuDoubleComplex *T, int LDT, + cuDoubleComplex *C, int LDC, + cuDoubleComplex *WORK, int LDWORK, + cublasHandle_t handle ) { int i, kb; int i1, i3; @@ -116,7 +116,7 @@ CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, T + LDT * i, LDT, C + LDC * jc + ic, LDC, WORK, LDWORK, - CUBLAS_STREAM_VALUE ); + handle ); } return CHAMELEON_SUCCESS; -- GitLab