From 4855a847e036efbd5c1a75e54c2cff2c5c54c241 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 16 Feb 2022 10:49:13 +0100 Subject: [PATCH] starpu/codelets: Remove the usage of the CUDA macros --- runtime/starpu/codelets/codelet_zgeadd.c | 17 +++++------------ runtime/starpu/codelets/codelet_zgemm.c | 9 ++------- runtime/starpu/codelets/codelet_zgemv.c | 10 ++-------- runtime/starpu/codelets/codelet_zhemm.c | 11 ++--------- runtime/starpu/codelets/codelet_zher2k.c | 11 ++--------- runtime/starpu/codelets/codelet_zherfb.c | 9 ++------- runtime/starpu/codelets/codelet_zherk.c | 11 ++--------- runtime/starpu/codelets/codelet_zsymm.c | 11 ++--------- runtime/starpu/codelets/codelet_zsyr2k.c | 11 ++--------- runtime/starpu/codelets/codelet_zsyrk.c | 11 ++--------- runtime/starpu/codelets/codelet_ztpmlqt.c | 9 ++------- runtime/starpu/codelets/codelet_ztpmqrt.c | 9 ++------- runtime/starpu/codelets/codelet_ztrmm.c | 11 ++--------- runtime/starpu/codelets/codelet_ztrsm.c | 11 ++--------- runtime/starpu/codelets/codelet_zunmlq.c | 9 ++------- runtime/starpu/codelets/codelet_zunmqr.c | 9 ++------- runtime/starpu/include/chameleon_starpu.h.in | 8 -------- 17 files changed, 35 insertions(+), 142 deletions(-) diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index d20ccca13..9cfe15c08 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -47,6 +47,7 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUBLAS static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_trans_t trans; int M; int N; @@ -59,18 +60,10 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) tileB = cti_interface_get(descr[1]); starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta ); - RUNTIME_getStream( stream ); - - CUDA_zgeadd( - trans, - M, N, - &alpha, tileA->mat, tileA->ld, - &beta, tileB->mat, tileB->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + CUDA_zgeadd( trans, M, N, + &alpha, tileA->mat, tileA->ld, + &beta, tileB->mat, tileB->ld, + handle ); return; } diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index a3f584f55..38d6337bb 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -64,6 +64,7 @@ static void cl_zgemm_cuda_func( void *descr[], void *cl_arg ) { struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -72,8 +73,6 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - RUNTIME_getStream( stream ); - assert( tileA->format & CHAMELEON_TILE_FULLRANK ); assert( tileB->format & CHAMELEON_TILE_FULLRANK ); assert( tileC->format & CHAMELEON_TILE_FULLRANK ); @@ -86,11 +85,7 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) tileB->mat, tileB->ld, (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + handle ); return; } diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c index 89fdd39f2..3aedbfa41 100644 --- a/runtime/starpu/codelets/codelet_zgemv.c +++ b/runtime/starpu/codelets/codelet_zgemv.c @@ -47,6 +47,7 @@ static void cl_zgemv_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) & 0 static void cl_zgemv_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_trans_t transA; cham_trans_t transB; int m; @@ -64,20 +65,13 @@ static void cl_zgemv_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta); - RUNTIME_getStream( stream ); - CUDA_zgemv( transA, transB, m, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - + handle ); return; } #endif /* defined(CHAMELEON_USE_CUDA) */ diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index 2b85fbdae..20d0e352b 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -54,6 +54,7 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_uplo_t uplo; int M; @@ -70,21 +71,13 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zhemm( side, uplo, M, N, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index a3d12b158..1f9585e58 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -51,6 +51,7 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; cham_trans_t trans; int n; @@ -67,20 +68,12 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zher2k( uplo, trans, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c index 4cc999337..af9b2a256 100644 --- a/runtime/starpu/codelets/codelet_zherfb.c +++ b/runtime/starpu/codelets/codelet_zherfb.c @@ -47,6 +47,7 @@ static void cl_zherfb_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; int n, k, ib, nb; CHAM_tile_t *tileA; @@ -55,8 +56,6 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) CHAM_tile_t *tileW; int ldW; - RUNTIME_getStream(stream); - tileA = cti_interface_get(descr[0]); tileT = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); @@ -68,11 +67,7 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg) tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 008d1822d..d877cda36 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -57,6 +57,7 @@ cl_zherk_cpu_func(void *descr[], void *cl_arg) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; @@ -64,21 +65,13 @@ cl_zherk_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_zherk( clargs->uplo, clargs->trans, clargs->n, clargs->k, &(clargs->alpha), tileA->mat, tileA->ld, &(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index e03e56be3..8026b32fe 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -54,6 +54,7 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_uplo_t uplo; int M; @@ -70,21 +71,13 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zsymm( side, uplo, M, N, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index 5a5c54787..c020938bf 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -51,6 +51,7 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) #ifdef CHAMELEON_USE_CUDA static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_uplo_t uplo; cham_trans_t trans; int n; @@ -67,20 +68,12 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); - RUNTIME_getStream(stream); - CUDA_zsyr2k( uplo, trans, n, k, &alpha, tileA->mat, tileA->ld, tileB->mat, tileB->ld, &beta, tileC->mat, tileC->ld, - stream); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* CHAMELEON_USE_CUDA */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index ee6f6aa53..1d09960c9 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -57,6 +57,7 @@ cl_zsyrk_cpu_func(void *descr[], void *cl_arg) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; @@ -64,21 +65,13 @@ cl_zsyrk_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_zsyrk( clargs->uplo, clargs->trans, clargs->n, clargs->k, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c index 076144bc9..ce1e70b15 100644 --- a/runtime/starpu/codelets/codelet_ztpmlqt.c +++ b/runtime/starpu/codelets/codelet_ztpmlqt.c @@ -51,6 +51,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int M; @@ -73,19 +74,13 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - RUNTIME_getStream(stream); - CUDA_ztpmlqt( side, trans, M, N, K, L, ib, tileV->mat, tileV->ld, tileT->mat, tileT->ld, tileA->mat, tileA->ld, tileB->mat, tileB->ld, - tileW->mat, lwork, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, lwork, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c index 66461e6c4..a8870a36f 100644 --- a/runtime/starpu/codelets/codelet_ztpmqrt.c +++ b/runtime/starpu/codelets/codelet_ztpmqrt.c @@ -51,6 +51,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int M; @@ -73,19 +74,13 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork ); - RUNTIME_getStream(stream); - CUDA_ztpmqrt( side, trans, M, N, K, L, ib, tileV->mat, tileV->ld, tileT->mat, tileT->ld, tileA->mat, tileA->ld, tileB->mat, tileB->ld, - tileW->mat, lwork, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, lwork, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index 1c6a2901c..0a34ea60a 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -57,6 +57,7 @@ cl_ztrmm_cpu_func(void *descr[], void *cl_arg) static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; @@ -64,21 +65,13 @@ cl_ztrmm_cuda_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_ztrmm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, clargs->m, clargs->n, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 423e95fc9..debe56add 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -59,27 +59,20 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - RUNTIME_getStream(stream); - CUDA_ztrsm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, clargs->m, clargs->n, (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif - - return; + handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c index e0d55765a..19efea82b 100644 --- a/runtime/starpu/codelets/codelet_zunmlq.c +++ b/runtime/starpu/codelets/codelet_zunmlq.c @@ -59,6 +59,7 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int m; @@ -78,18 +79,12 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - RUNTIME_getStream(stream); - CUDA_zunmlqt( side, trans, m, n, k, ib, tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c index 58cad3e8a..31489663d 100644 --- a/runtime/starpu/codelets/codelet_zunmqr.c +++ b/runtime/starpu/codelets/codelet_zunmqr.c @@ -58,6 +58,7 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg) #if defined(CHAMELEON_USE_CUDA) static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) { + cublasHandle_t handle = starpu_cublas_get_local_handle(); cham_side_t side; cham_trans_t trans; int m; @@ -77,18 +78,12 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW ); - RUNTIME_getStream(stream); - CUDA_zunmqrt( side, trans, m, n, k, ib, tileA->mat, tileA->ld, tileT->mat, tileT->ld, tileC->mat, tileC->ld, - tileW->mat, ldW, stream ); - -#ifndef STARPU_CUDA_ASYNC - cudaStreamSynchronize( stream ); -#endif + tileW->mat, ldW, handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index b3a3e8e70..15391b62d 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -125,14 +125,6 @@ typedef struct starpu_option_request_s { #endif -/* - * cuBLAS API - StarPU enable the support for cublas handle - */ -#if defined(CHAMELEON_USE_CUDA) -#define RUNTIME_getStream(_stream_) \ - cublasHandle_t _stream_ = starpu_cublas_get_local_handle(); -#endif - /* * Enable codelets names */ -- GitLab