diff --git a/CMakeLists.txt b/CMakeLists.txt index 55d8bca913e2a784ce9a322231fdfcc7987b1db7..0802349b53656984d5be450c6cfbb1da1665df58 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1003,6 +1003,12 @@ if( CHAMELEON_SCHED_QUARK ) endif() +# Add option to exploit cublas API v2 +# ----------------------------------- +cmake_dependent_option(CHAMELEON_USE_CUBLAS_V2 + "Enable cublas API v2" ON + "CHAMELEON_USE_CUDA;CHAMELEON_SCHED_STARPU" OFF) + list(REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS) string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}") # Fix a problem on Mac OS X when building shared libraries diff --git a/control/common.h b/control/common.h index f08f2712c91a98fe18790d40331168d06b325f71..ab1a98867bfed1be80fd09e0db14969773b93e6b 100644 --- a/control/common.h +++ b/control/common.h @@ -54,6 +54,7 @@ #include <cuda_runtime.h> #include <cuda_runtime_api.h> #if defined(CHAMELEON_USE_CUBLAS_V2) +#include <cublas.h> #include <cublas_v2.h> #else #include <cublas.h> diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt index 06d25bd476f7d7f5286e40eefd583cee53d2de2a..39a9d20be5bb4f73d6892417af809b2db0f9be90 100644 --- a/cudablas/compute/CMakeLists.txt +++ b/cudablas/compute/CMakeLists.txt @@ -65,12 +65,20 @@ if( CHAMELEON_USE_MAGMA ) ) endif() -precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}" - PRECISIONS "${CHAMELEON_PRECISION}") +precisions_rules_py( + CUDABLAS_SRCS_GENERATED "${ZSRC}" + PRECISIONS "${CHAMELEON_PRECISION}") set(CUDABLAS_SRCS - ${CUDABLAS_SRCS_GENERATED} + ${CUDABLAS_SRCS_GENERATED} + ) + +if (CHAMELEON_USE_CUBLAS_V2) + set(CUDABLAS_SRCS + ${CUDABLAS_SRCS} + cudaglobal.c ) +endif (CHAMELEON_USE_CUBLAS_V2) # Compile step # ------------ diff --git a/cudablas/compute/cuda_zgemm.c b/cudablas/compute/cuda_zgemm.c index 0c6a61b4f43a214eeaa83a3b2f4c356c22207adb..c5d0503b531443ab37b7dbeb127b3af672ec0250 100644 --- a/cudablas/compute/cuda_zgemm.c +++ b/cudablas/compute/cuda_zgemm.c @@ -34,18 +34,13 @@ int CUDA_zgemm(MORSE_enum transa, MORSE_enum transb, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { - -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif cublasZgemm(CUBLAS_HANDLE - morse_lapack_const(transa), morse_lapack_const(transb), + morse_cublas_const(transa), morse_cublas_const(transb), m, n, k, CUBLAS_VALUE(alpha), A, lda, B, ldb, CUBLAS_VALUE(beta), C, ldc); assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); - return MORSE_SUCCESS; } diff --git a/cudablas/compute/cuda_zhemm.c b/cudablas/compute/cuda_zhemm.c index 8ed742362c32c0c6d0e2891510c333efb7cf431d..5cb93159e26ddbe5ddc3fb2608f1436b7dcf9f79 100644 --- a/cudablas/compute/cuda_zhemm.c +++ b/cudablas/compute/cuda_zhemm.c @@ -34,18 +34,13 @@ int CUDA_zhemm(MORSE_enum side, MORSE_enum uplo, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZhemm(CUBLAS_HANDLE - morse_lapack_const(side), morse_lapack_const(uplo), + morse_cublas_const(side), morse_cublas_const(uplo), m, n, CUBLAS_VALUE(alpha), A, lda, B, ldb, CUBLAS_VALUE(beta), C, ldc); assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); - return MORSE_SUCCESS; } diff --git a/cudablas/compute/cuda_zher2k.c b/cudablas/compute/cuda_zher2k.c index 5b340abd58953d91f2d8a60965ab935173207972..206d0d50dc8e13dbf6aa55b528808088d057a11a 100644 --- a/cudablas/compute/cuda_zher2k.c +++ b/cudablas/compute/cuda_zher2k.c @@ -34,12 +34,8 @@ int CUDA_zher2k(MORSE_enum uplo, MORSE_enum trans, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZher2k(CUBLAS_HANDLE - morse_lapack_const(uplo), morse_lapack_const(trans), + morse_cublas_const(uplo), morse_cublas_const(trans), n, k, CUBLAS_VALUE(alpha), A, lda, B, ldb, diff --git a/cudablas/compute/cuda_zherk.c b/cudablas/compute/cuda_zherk.c index 39717408795f1dbd70076d4ccf81b7867888489f..f5eec6d1721efefdce0f5a8ce974887b00c9b1c6 100644 --- a/cudablas/compute/cuda_zherk.c +++ b/cudablas/compute/cuda_zherk.c @@ -33,15 +33,11 @@ int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - - cublasZherk( - morse_lapack_const(uplo), morse_lapack_const(trans), - n, k, - *alpha, A, lda, - *beta, B, ldb); + cublasZherk( CUBLAS_HANDLE + morse_cublas_const(uplo), morse_cublas_const(trans), + n, k, + CUBLAS_VALUE(alpha), A, lda, + CUBLAS_VALUE(beta), B, ldb); assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c index c35b555861e140400f223dc7a5d460a75286def8..0faee149e21d22aa9d4092da6aa5485997c5e0a1 100644 --- a/cudablas/compute/cuda_zlarfb.c +++ b/cudablas/compute/cuda_zlarfb.c @@ -49,10 +49,6 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum transT, uplo, notransV, transV; -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - /* Check input arguments */ if ((side != MorseLeft) && (side != MorseRight)) { return -1; @@ -107,23 +103,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, // W = C^H V cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseConjTrans), morse_lapack_const(notransV), + morse_cublas_const(MorseConjTrans), morse_cublas_const(notransV), N, K, M, CUBLAS_SADDR(zone), C, LDC, V, LDV, CUBLAS_SADDR(zzero), WORK, LDWORK ); // W = W T^H = C^H V T^H - cublasZtrmm( CUBLAS_HANDLE - morse_lapack_const(MorseRight), morse_lapack_const(uplo), - morse_lapack_const(transT), morse_lapack_const(MorseNonUnit), - N, K, CUBLAS_SADDR(zone), - T, LDT, - WORK, LDWORK); + CUDA_ztrmm( MorseRight, uplo, transT, MorseNonUnit, + N, K, + CUBLAS_SADDR(zone), T, LDT, + WORK, LDWORK, + CUBLAS_STREAM_VALUE ); // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(notransV), morse_lapack_const(MorseConjTrans), + morse_cublas_const(notransV), morse_cublas_const(MorseConjTrans), M, N, K, CUBLAS_SADDR(mzone), V, LDV, WORK, LDWORK, @@ -135,23 +130,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, // W = C V cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(notransV), + morse_cublas_const(MorseNoTrans), morse_cublas_const(notransV), M, K, N, CUBLAS_SADDR(zone), C, LDC, V, LDV, CUBLAS_SADDR(zzero), WORK, LDWORK ); // W = W T = C V T - cublasZtrmm( CUBLAS_HANDLE - morse_lapack_const(MorseRight), morse_lapack_const(uplo), - morse_lapack_const(trans), morse_lapack_const(MorseNonUnit), - M, K, CUBLAS_SADDR(zone), - T, LDT, - WORK, LDWORK); + CUDA_ztrmm( MorseRight, uplo, trans, MorseNonUnit, + M, K, + CUBLAS_SADDR(zone), T, LDT, + WORK, LDWORK, + CUBLAS_STREAM_VALUE ); // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(transV), + morse_cublas_const(MorseNoTrans), morse_cublas_const(transV), M, N, K, CUBLAS_SADDR(mzone), WORK, LDWORK, V, LDV, diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c index edca0c8d7e13b14995ac2c74857cdfa5106e38ca..e3e29f9569d37451b8741861a238315ed122115b 100644 --- a/cudablas/compute/cuda_zparfb.c +++ b/cudablas/compute/cuda_zparfb.c @@ -243,7 +243,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, transA2 = storev == MorseColumnwise ? MorseNoTrans : MorseConjTrans; cublasZgemm(CUBLAS_HANDLE - morse_lapack_const(transW), morse_lapack_const(MorseNoTrans), + morse_cublas_const(transW), morse_cublas_const(MorseNoTrans), K, N1, M2, CUBLAS_SADDR(zone), V /* K*M2 */, LDV, @@ -253,14 +253,11 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, if (WORKC == NULL) { /* W = op(T) * W */ - cublasZtrmm( CUBLAS_HANDLE - morse_lapack_const(MorseLeft), morse_lapack_const(MorseUpper), - morse_lapack_const(trans), morse_lapack_const(MorseNonUnit), - K, N2, - CUBLAS_SADDR(zone), - T, LDT, - WORK, LDWORK); - + CUDA_ztrmm( MorseLeft, MorseUpper, trans, MorseNonUnit, + K, N2, + CUBLAS_SADDR(zone), T, LDT, + WORK, LDWORK, + CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - op(T) * W */ for(j = 0; j < N1; j++) { @@ -272,7 +269,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* A2 = A2 - op(V) * W */ cublasZgemm(CUBLAS_HANDLE - morse_lapack_const(transA2), morse_lapack_const(MorseNoTrans), + morse_cublas_const(transA2), morse_cublas_const(MorseNoTrans), M2, N2, K, CUBLAS_SADDR(mzone), V /* M2*K */, LDV, WORK /* K*N2 */, LDWORK, @@ -281,7 +278,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, } else { /* Wc = V * op(T) */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(transA2), morse_lapack_const(trans), + morse_cublas_const(transA2), morse_cublas_const(trans), M2, K, K, CUBLAS_SADDR(zone), V, LDV, T, LDT, @@ -289,7 +286,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* A1 = A1 - opt(T) * W */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(trans), morse_lapack_const(MorseNoTrans), + morse_cublas_const(trans), morse_cublas_const(MorseNoTrans), K, N1, K, CUBLAS_SADDR(mzone), T, LDT, WORK, LDWORK, @@ -297,7 +294,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* A2 = A2 - Wc * W */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(MorseNoTrans), + morse_cublas_const(MorseNoTrans), morse_cublas_const(MorseNoTrans), M2, N2, K, CUBLAS_SADDR(mzone), WORKC, LDWORKC, WORK, LDWORK, @@ -328,7 +325,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, transA2 = storev == MorseColumnwise ? MorseConjTrans : MorseNoTrans; cublasZgemm(CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(transW), + morse_cublas_const(MorseNoTrans), morse_cublas_const(transW), M1, K, N2, CUBLAS_SADDR(zone), A2 /* M1*N2 */, LDA2, V /* N2*K */, LDV, @@ -336,14 +333,11 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, if (WORKC == NULL) { /* W = W * op(T) */ - cublasZtrmm( CUBLAS_HANDLE - morse_lapack_const(MorseRight), morse_lapack_const(MorseUpper), - morse_lapack_const(trans), morse_lapack_const(MorseNonUnit), - M2, K, - CUBLAS_SADDR(zone), - T, LDT, - WORK, LDWORK); - + CUDA_ztrmm( MorseRight, MorseUpper, trans, MorseNonUnit, + M2, K, + CUBLAS_SADDR(zone), T, LDT, + WORK, LDWORK, + CUBLAS_STREAM_VALUE ); /* A1 = A1 - W = A1 - W * op(T) */ for(j = 0; j < K; j++) { @@ -355,7 +349,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* A2 = A2 - W * op(V) */ cublasZgemm(CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(transA2), + morse_cublas_const(MorseNoTrans), morse_cublas_const(transA2), M2, N2, K, CUBLAS_SADDR(mzone), WORK /* M2*K */, LDWORK, V /* K*N2 */, LDV, @@ -364,7 +358,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, } else { /* A1 = A1 - W * opt(T) */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(trans), + morse_cublas_const(MorseNoTrans), morse_cublas_const(trans), M1, K, K, CUBLAS_SADDR(mzone), WORK, LDWORK, T, LDT, @@ -372,7 +366,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* Wc = op(T) * V */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(trans), morse_lapack_const(transA2), + morse_cublas_const(trans), morse_cublas_const(transA2), K, N2, K, CUBLAS_SADDR(zone), T, LDT, V, LDV, @@ -380,7 +374,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans, /* A2 = A2 - W * Wc */ cublasZgemm( CUBLAS_HANDLE - morse_lapack_const(MorseNoTrans), morse_lapack_const(MorseNoTrans), + morse_cublas_const(MorseNoTrans), morse_cublas_const(MorseNoTrans), M2, N2, K, CUBLAS_SADDR(mzone), WORK, LDWORK, WORKC, LDWORKC, diff --git a/cudablas/compute/cuda_zsymm.c b/cudablas/compute/cuda_zsymm.c index 93ec3f386942d33cee4250721be2a794679e581a..5213565edb79207844345170f9de2cb8549a7d1b 100644 --- a/cudablas/compute/cuda_zsymm.c +++ b/cudablas/compute/cuda_zsymm.c @@ -34,12 +34,8 @@ int CUDA_zsymm(MORSE_enum side, MORSE_enum uplo, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZsymm(CUBLAS_HANDLE - morse_lapack_const(side), morse_lapack_const(uplo), + morse_cublas_const(side), morse_cublas_const(uplo), m, n, CUBLAS_VALUE(alpha), A, lda, B, ldb, diff --git a/cudablas/compute/cuda_zsyr2k.c b/cudablas/compute/cuda_zsyr2k.c index 839ea2d68774c3a24f746048f2d9737d53640b85..c80babbad1e968f721881b630a18515ce25de51a 100644 --- a/cudablas/compute/cuda_zsyr2k.c +++ b/cudablas/compute/cuda_zsyr2k.c @@ -35,12 +35,8 @@ int CUDA_zsyr2k( cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZsyr2k(CUBLAS_HANDLE - morse_lapack_const(uplo), morse_lapack_const(trans), + morse_cublas_const(uplo), morse_cublas_const(trans), n, k, CUBLAS_VALUE(alpha), A, lda, B, ldb, diff --git a/cudablas/compute/cuda_zsyrk.c b/cudablas/compute/cuda_zsyrk.c index e988fbc41d9f56abd0cc73f8d42190fdea81bef0..11d18ea1556b15b3511ad20b50451c5cd76c06d3 100644 --- a/cudablas/compute/cuda_zsyrk.c +++ b/cudablas/compute/cuda_zsyrk.c @@ -33,12 +33,8 @@ int CUDA_zsyrk(MORSE_enum uplo, MORSE_enum trans, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZsyrk(CUBLAS_HANDLE - morse_lapack_const(uplo), morse_lapack_const(trans), + morse_cublas_const(uplo), morse_cublas_const(trans), n, k, CUBLAS_VALUE(alpha), A, lda, CUBLAS_VALUE(beta), C, ldc); diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c index d86fa5267fe483ab5517d393aa93913f1bf7d512..349a6f98a02597c81b7dbec7cf395b3cd8498ffc 100644 --- a/cudablas/compute/cuda_ztrmm.c +++ b/cudablas/compute/cuda_ztrmm.c @@ -34,17 +34,29 @@ int CUDA_ztrmm( cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZtrmm(CUBLAS_HANDLE - morse_lapack_const(side), morse_lapack_const(uplo), - morse_lapack_const(transa), morse_lapack_const(diag), +#if defined(CHAMELEON_USE_CUBLAS_V2) + + cublasZtrmm( + CUBLAS_HANDLE + morse_cublas_const(side), morse_cublas_const(uplo), + morse_cublas_const(transa), morse_cublas_const(diag), m, n, CUBLAS_VALUE(alpha), A, lda, + B, ldb, B, ldb); +#else + + cublasZtrmm( + CUBLAS_HANDLE + morse_cublas_const(side), morse_cublas_const(uplo), + morse_cublas_const(transa), morse_cublas_const(diag), + m, n, + CUBLAS_VALUE(alpha), A, lda, + B, ldb); +#endif + assert( CUBLAS_STATUS_SUCCESS == cublasGetError() ); return MORSE_SUCCESS; diff --git a/cudablas/compute/cuda_ztrsm.c b/cudablas/compute/cuda_ztrsm.c index d82766bc8e7710a3eb65aac2e2ae57e3427bf9a6..ae485b751b79ea71b04fbe9ce34c3aab08d97b36 100644 --- a/cudablas/compute/cuda_ztrsm.c +++ b/cudablas/compute/cuda_ztrsm.c @@ -33,13 +33,9 @@ int CUDA_ztrsm(MORSE_enum side, MORSE_enum uplo, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM) { -#if !defined(CHAMELEON_USE_CUBLAS_V2) - cublasSetKernelStream( stream ); -#endif - cublasZtrsm(CUBLAS_HANDLE - morse_lapack_const(side), morse_lapack_const(uplo), - morse_lapack_const(transa), morse_lapack_const(diag), + morse_cublas_const(side), morse_cublas_const(uplo), + morse_cublas_const(transa), morse_cublas_const(diag), m, n, CUBLAS_VALUE(alpha), A, lda, B, ldb); diff --git a/cudablas/compute/cudaglobal.c b/cudablas/compute/cudaglobal.c new file mode 100644 index 0000000000000000000000000000000000000000..597501e9e45318c282226525dc2367afece94873 --- /dev/null +++ b/cudablas/compute/cudaglobal.c @@ -0,0 +1,127 @@ +/** + * + * @copyright (c) 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + **/ +/** + * + * @file cudaglobal.c + * + * MORSE auxiliary routines + * MORSE is a software package provided by Univ. of Tennessee, + * Univ. of California Berkeley and Univ. of Colorado Denver + * + * @version 0.9.0 + * @author Mathieu Faverge + * @date 2017-04-06 + * + **/ +#include "cudablas/include/cudablas.h" + +/******************************************************************************* + * LAPACK Constants + **/ +int morse_cublas_constants[] = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 100 + 0, // 101: MorseRowMajor + 0, // 102: MorseColMajor + 0, 0, 0, 0, 0, 0, 0, 0, + CUBLAS_OP_N, // 111: MorseNoTrans + CUBLAS_OP_T, // 112: MorseTrans + CUBLAS_OP_C, // 113: MorseConjTrans + 0, 0, 0, 0, 0, 0, 0, + CUBLAS_FILL_MODE_UPPER, // 121: MorseUpper + CUBLAS_FILL_MODE_LOWER, // 122: MorseLower + 0, // 123: MorseUpperLower + 0, 0, 0, 0, 0, 0, 0, + CUBLAS_DIAG_NON_UNIT, // 131: MorseNonUnit + CUBLAS_DIAG_UNIT, // 132: MorseUnit + 0, 0, 0, 0, 0, 0, 0, 0, + CUBLAS_SIDE_LEFT, // 141: MorseLeft + CUBLAS_SIDE_RIGHT, // 142: MorseRight + 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 151: + 0, // 152: + 0, // 153: + 0, // 154: + 0, // 155: + 0, // 156: + 0, // 157: MorseEps + 0, // 158: + 0, // 159: + 0, // 160: + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 171: MorseOneNorm + 0, // 172: MorseRealOneNorm + 0, // 173: MorseTwoNorm + 0, // 174: MorseFrobeniusNorm + 0, // 175: MorseInfNorm + 0, // 176: MorseRealInfNorm + 0, // 177: MorseMaxNorm + 0, // 178: MorseRealMaxNorm + 0, // 179 + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 200 + 0, // 201: MorseDistUniform + 0, // 202: MorseDistSymmetric + 0, // 203: MorseDistNormal + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 240 + 0, // 241 MorseHermGeev + 0, // 242 MorseHermPoev + 0, // 243 MorseNonsymPosv + 0, // 244 MorseSymPosv + 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 290 + 0, // 291 MorseNoPacking + 0, // 292 MorsePackSubdiag + 0, // 293 MorsePackSupdiag + 0, // 294 MorsePackColumn + 0, // 295 MorsePackRow + 0, // 296 MorsePackLowerBand + 0, // 297 MorsePackUpeprBand + 0, // 298 MorsePackAll + 0, // 299 + 0, // 300 + 0, // 301 MorseNoVec + 0, // 302 MorseVec + 0, // 303 MorseIvec + 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 390 + 0, // 391 + 0, // 392 + 0, 0, 0, 0, 0, 0, 0, 0, + 0, // 401 + 0, // 402 + 0, 0, 0, 0, 0, 0, 0, 0 // Remember to add a coma! +}; diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h index d6c2004fe9bfa69e0b7ddb1fe9accb2f23f521e1..b181fc8332a36c1024f80c97d175e29f4ad1e0ae 100644 --- a/cudablas/include/cudablas.h +++ b/cudablas/include/cudablas.h @@ -41,7 +41,9 @@ #if defined(CHAMELEON_USE_CUBLAS_V2) +#include <cublas.h> #include <cublas_v2.h> + #define CUBLAS_STREAM_PARAM cublasHandle_t handle #define CUBLAS_STREAM_VALUE handle #define CUBLAS_HANDLE handle, @@ -96,4 +98,12 @@ extern char *morse_lapack_constants[]; #define morse_lapack_const(morse_const) morse_lapack_constants[morse_const][0] +extern int morse_cublas_constants[]; + +#if defined(CHAMELEON_USE_CUBLAS_V2) +#define morse_cublas_const(morse_const) morse_cublas_constants[morse_const] +#else +#define morse_cublas_const(morse_const) morse_lapack_constants[morse_const][0] +#endif + #endif diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c index af571f2ae79f096aa8efc680c16957b334fa8c48..22355977823a4df1aa73e905cc7db4d158025d1f 100644 --- a/runtime/starpu/codelets/codelet_zgelqt.c +++ b/runtime/starpu/codelets/codelet_zgelqt.c @@ -166,7 +166,6 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; cuDoubleComplex *d_A, *d_T, *d_D, *d_W; int lda, ldt; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); @@ -186,15 +185,14 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + chameleon_max(m,n); h_D = h_W + ib*ib; - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_zgelqt( m, n, ib, d_A, lda, h_A, ib, d_T, ldt, h_T, ib, d_D, h_D, ib, h_TAU, - h_W, d_W, stream); + h_W, d_W, stream ); cudaThreadSynchronize(); } diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 292e071de10a48b70fb34fa6e91ad2dfdc3c7de4..90fe880a741d756713ef0083fd4a73a83f1e225f 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -148,14 +148,13 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream( stream ); CUDA_zgemm( transA, transB, diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c index 50769b3be7323b6342f68889c1138d6d4950d1b2..868c10c16064001632a036ec7b5a7791f6fbb0fa 100644 --- a/runtime/starpu/codelets/codelet_zgeqrt.c +++ b/runtime/starpu/codelets/codelet_zgeqrt.c @@ -166,7 +166,6 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU; cuDoubleComplex *d_A, *d_T, *d_D, *d_W; int lda, ldt; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work); @@ -186,8 +185,7 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + chameleon_max(m,n); h_D = h_W + ib*ib; - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_zgeqrt( m, n, ib, diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index df45234630fbd77f77f7e976eea77cba868c305b..97f0adcc5b3fbf1c275e5feded3fa258b77ed12f 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -119,14 +119,13 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int LDC; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zhemm( side, uplo, diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 9b6b7194db42d17a7fb5eb755efc76ec7c79c09b..7c5509240eb3893fab6e471a2d7beed51159f41c 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -116,14 +116,13 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) double beta; cuDoubleComplex *C; int ldc; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zher2k( uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c index b5ceec04b8ded9bf733c534645ae7a5303294b5b..8b6c5d8d37e80d64da4d526554faaf2d0e67d4be 100644 --- a/runtime/starpu/codelets/codelet_zherfb.c +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -113,9 +113,8 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) int ldc; cuDoubleComplex *WORK; int ldwork; - CUstream stream; - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index e5ef780218f5cf95f13f6956b971350d38268b8f..0576cef52ac2abcca50dbfc36ed5422746f4c5a1 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -108,13 +108,12 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg) double beta; cuDoubleComplex *C; int ldc; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zherk( uplo, trans, diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 71e6b86700dde96065d72c0a0c3e43edac347f2c..bb01bfc7bac752da147983e5ac3a7640836d1925 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -119,14 +119,13 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int LDC; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zsymm( side, uplo, diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index cafe3887e071c926e91f296b79d97210eaf71c15..aec9d75f5a5f43dc087dfdcbb684cc5b68216b12 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -116,14 +116,13 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zsyr2k( uplo, trans, n, k, &alpha, A, lda, B, ldb, &beta, C, ldc, diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index b368779f50cc7af2dcb8644be56720746a4deb37..265a8922d43779fc8897c4a26e48f6ac3d034ea9 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -109,13 +109,12 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex beta; cuDoubleComplex *C; int ldc; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_zsyrk( uplo, trans, diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 5428c9eea8badfdc586898350e80af94497a2f6a..c1344760444deb6f292017ed0252eb0c7d9188df 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -126,7 +126,6 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *B; int ldb; cuDoubleComplex *W; - CUstream stream; V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -137,7 +136,7 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &ldv, &ldt, &lda, &ldb ); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_ztpmqrt( side, trans, M, N, K, L, ib, diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index ca54848703d4778361831948aaebfff235795779..b002c81709b0811d838670c96d38fbdb94086897 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -113,13 +113,12 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) int LDA; cuDoubleComplex *B; int LDB; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha, &LDA, &LDB); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_ztrmm( side, uplo, diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 76903283298f39b36ee4bcfea4263b563a5434ca..288427e82cb5c65809ed9fdfd6bbe48de7adba01 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -135,13 +135,12 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) int lda; cuDoubleComplex *B; int ldb; - CUstream stream; A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha, &lda, &ldb); - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_ztrsm( side, uplo, transA, diag, diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c index 3780308276014e3c7596a79b65d1e32e01df2466..101feebfe80cede6ddb1f759a5883da3a495ce79 100644 --- a/runtime/starpu/codelets/codelet_ztslqt.c +++ b/runtime/starpu/codelets/codelet_ztslqt.c @@ -185,7 +185,6 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W; cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W; int lda1, lda2, ldt; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); @@ -204,7 +203,7 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + chameleon_max(m,n); h_D = h_W + ib*m; - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_ztslqt( m, n, ib, d_A1, lda1, d_A2, lda2, diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c index 3901295404dcd256073c9e34fa72bcbd2d4dfce0..1ec507ca2f66d682624e9b69a83dfcbb105d0f84 100644 --- a/runtime/starpu/codelets/codelet_ztsmlq.c +++ b/runtime/starpu/codelets/codelet_ztsmlq.c @@ -239,7 +239,6 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *W, *WC; int ldwork; int ldworkc; - CUstream stream; A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -253,8 +252,7 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg) WC = W + ib * ldwork; ldworkc = (side == MorseLeft) ? m1 : ib; - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib, A1, lda1, A2, lda2, V, ldv, T, ldt, diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c index ed4f7dbe9303d36cef5385dd33cddb54b9583fd4..c1c177c9279e3e3c71c827fc3601da98663cc264 100644 --- a/runtime/starpu/codelets/codelet_ztsmqr.c +++ b/runtime/starpu/codelets/codelet_ztsmqr.c @@ -270,7 +270,6 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *W, *WC; int ldwork; int ldworkc; - CUstream stream; A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]); A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]); @@ -284,8 +283,7 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg) WC = W + ib * (side == MorseLeft ? m1 : n1); ldworkc = (side == MorseLeft) ? m2 : ib; - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_ztsmqr( side, trans, m1, n1, m2, n2, k, ib, diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c index 880e1ff8a5acf3c455a2d0a4835d092205404e38..4c5d03fe2b4eeb60fa30dd14f9a250f3bfae080c 100644 --- a/runtime/starpu/codelets/codelet_ztsqrt.c +++ b/runtime/starpu/codelets/codelet_ztsqrt.c @@ -176,7 +176,6 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W; cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W; int lda1, lda2, ldt; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work); @@ -195,7 +194,7 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg) h_W = h_TAU + chameleon_max(m,n); h_D = h_W + ib*n; - stream = starpu_cuda_get_local_stream(); + RUNTIME_getStream(stream); CUDA_ztsqrt( m, n, ib, d_A1, lda1, d_A2, lda2, diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index 6525661f635f020ec7c234e456eeca32ecbd177c..b990da74beadad4b29ab49c6acfc1f4dd0cd413d 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -201,7 +201,6 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) const cuDoubleComplex *A, *T; cuDoubleComplex *C, *WORK; int lda, ldt, ldc, ldwork; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &lda, &ldt, &ldc, &ldwork); @@ -211,8 +210,7 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_zunmlqt( side, trans, m, n, k, ib, diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 2b88c1d3b90206b3acd2cc1a238a0ca578c81beb..d4c6d6d813ae9d948d232908da530c1c963acd3e 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -227,7 +227,6 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) const cuDoubleComplex *A, *T; cuDoubleComplex *C, *WORK; int lda, ldt, ldc, ldwork; - CUstream stream; starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib, &lda, &ldt, &ldc, &ldwork); @@ -237,8 +236,7 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]); WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */ - stream = starpu_cuda_get_local_stream(); - cublasSetKernelStream( stream ); + RUNTIME_getStream(stream); CUDA_zunmqrt( side, trans, m, n, k, ib, diff --git a/runtime/starpu/include/morse_starpu.h b/runtime/starpu/include/morse_starpu.h index c1980d3b615f7ef49d51d2797195c7851b2471a7..f0649aeaf585e3f45fc4f5a548b8ab0a5d5c8401 100644 --- a/runtime/starpu/include/morse_starpu.h +++ b/runtime/starpu/include/morse_starpu.h @@ -45,6 +45,13 @@ #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) #include <starpu_scheduler.h> #include <starpu_cuda.h> + +#include <cublas.h> +#include <starpu_cublas.h> +#if defined(CHAMELEON_USE_CUBLAS_V2) +#include <cublas_v2.h> +#include <starpu_cublas_v2.h> +#endif #endif #include "control/common.h" @@ -72,6 +79,19 @@ typedef struct starpu_conf starpu_conf_t; #endif +/* + * cuBlasAPI v2 - StarPU enable the support for cublas handle + */ +#if defined(CHAMELEON_USE_CUDA) && defined(CHAMELEON_USE_CUBLAS_V2) +#define RUNTIME_getStream(_stream_) \ + cublasHandle_t _stream_ = starpu_cublas_get_local_handle(); +#else +#define RUNTIME_getStream(_stream_) \ + cudaStream_t _stream_ = starpu_cuda_get_local_stream(); \ + cublasSetKernelStream( stream ); + +#endif + /* * Enable codelets names */