From 4855a847e036efbd5c1a75e54c2cff2c5c54c241 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 16 Feb 2022 10:49:13 +0100
Subject: [PATCH] starpu/codelets: Remove the usage of the CUDA macros

---
 runtime/starpu/codelets/codelet_zgeadd.c     | 17 +++++------------
 runtime/starpu/codelets/codelet_zgemm.c      |  9 ++-------
 runtime/starpu/codelets/codelet_zgemv.c      | 10 ++--------
 runtime/starpu/codelets/codelet_zhemm.c      | 11 ++---------
 runtime/starpu/codelets/codelet_zher2k.c     | 11 ++---------
 runtime/starpu/codelets/codelet_zherfb.c     |  9 ++-------
 runtime/starpu/codelets/codelet_zherk.c      | 11 ++---------
 runtime/starpu/codelets/codelet_zsymm.c      | 11 ++---------
 runtime/starpu/codelets/codelet_zsyr2k.c     | 11 ++---------
 runtime/starpu/codelets/codelet_zsyrk.c      | 11 ++---------
 runtime/starpu/codelets/codelet_ztpmlqt.c    |  9 ++-------
 runtime/starpu/codelets/codelet_ztpmqrt.c    |  9 ++-------
 runtime/starpu/codelets/codelet_ztrmm.c      | 11 ++---------
 runtime/starpu/codelets/codelet_ztrsm.c      | 11 ++---------
 runtime/starpu/codelets/codelet_zunmlq.c     |  9 ++-------
 runtime/starpu/codelets/codelet_zunmqr.c     |  9 ++-------
 runtime/starpu/include/chameleon_starpu.h.in |  8 --------
 17 files changed, 35 insertions(+), 142 deletions(-)

diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index d20ccca13..9cfe15c08 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -47,6 +47,7 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg)
 #ifdef CHAMELEON_USE_CUBLAS
 static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_trans_t trans;
     int M;
     int N;
@@ -59,18 +60,10 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg)
     tileB = cti_interface_get(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta );
 
-    RUNTIME_getStream( stream );
-
-    CUDA_zgeadd(
-        trans,
-        M, N,
-        &alpha, tileA->mat, tileA->ld,
-        &beta,  tileB->mat, tileB->ld,
-        stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+    CUDA_zgeadd( trans, M, N,
+                 &alpha, tileA->mat, tileA->ld,
+                 &beta,  tileB->mat, tileB->ld,
+                 handle );
 
     return;
 }
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index a3f584f55..38d6337bb 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -64,6 +64,7 @@ static void
 cl_zgemm_cuda_func( void *descr[], void *cl_arg )
 {
     struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAM_tile_t *tileC;
@@ -72,8 +73,6 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg )
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    RUNTIME_getStream( stream );
-
     assert( tileA->format & CHAMELEON_TILE_FULLRANK );
     assert( tileB->format & CHAMELEON_TILE_FULLRANK );
     assert( tileC->format & CHAMELEON_TILE_FULLRANK );
@@ -86,11 +85,7 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg )
         tileB->mat, tileB->ld,
         (cuDoubleComplex*)&(clargs->beta),
         tileC->mat, tileC->ld,
-        stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+        handle );
 
     return;
 }
diff --git a/runtime/starpu/codelets/codelet_zgemv.c b/runtime/starpu/codelets/codelet_zgemv.c
index 89fdd39f2..3aedbfa41 100644
--- a/runtime/starpu/codelets/codelet_zgemv.c
+++ b/runtime/starpu/codelets/codelet_zgemv.c
@@ -47,6 +47,7 @@ static void cl_zgemv_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA) & 0
 static void cl_zgemv_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_trans_t transA;
     cham_trans_t transB;
     int m;
@@ -64,20 +65,13 @@ static void cl_zgemv_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &beta);
 
-    RUNTIME_getStream( stream );
-
     CUDA_zgemv(
         transA, transB,
         m, n, k,
         &alpha, tileA->mat, tileA->ld,
                 tileB->mat, tileB->ld,
         &beta,  tileC->mat, tileC->ld,
-        stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
+        handle );
     return;
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index 2b85fbdae..20d0e352b 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -54,6 +54,7 @@ static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
 #ifdef CHAMELEON_USE_CUDA
 static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_uplo_t uplo;
     int M;
@@ -70,21 +71,13 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zhemm(
         side, uplo,
         M, N,
         &alpha, tileA->mat, tileA->ld,
                 tileB->mat, tileB->ld,
         &beta,  tileC->mat, tileC->ld,
-        stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* CHAMELEON_USE_CUDA */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index a3d12b158..1f9585e58 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -51,6 +51,7 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
 #ifdef CHAMELEON_USE_CUDA
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_uplo_t uplo;
     cham_trans_t trans;
     int n;
@@ -67,20 +68,12 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zher2k( uplo, trans,
                  n, k,
                  &alpha, tileA->mat, tileA->ld,
                          tileB->mat, tileB->ld,
                  &beta,  tileC->mat, tileC->ld,
-                 stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+                 handle );
 }
 #endif /* CHAMELEON_USE_CUDA */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c
index 4cc999337..af9b2a256 100644
--- a/runtime/starpu/codelets/codelet_zherfb.c
+++ b/runtime/starpu/codelets/codelet_zherfb.c
@@ -47,6 +47,7 @@ static void cl_zherfb_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_zherfb_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_uplo_t uplo;
     int n, k, ib, nb;
     CHAM_tile_t *tileA;
@@ -55,8 +56,6 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg)
     CHAM_tile_t *tileW;
     int ldW;
 
-    RUNTIME_getStream(stream);
-
     tileA = cti_interface_get(descr[0]);
     tileT = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
@@ -68,11 +67,7 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg)
                  tileA->mat, tileA->ld,
                  tileT->mat, tileT->ld,
                  tileC->mat, tileC->ld,
-                 tileW->mat, ldW, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+                 tileW->mat, ldW, handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 008d1822d..d877cda36 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -57,6 +57,7 @@ cl_zherk_cpu_func(void *descr[], void *cl_arg)
 static void
 cl_zherk_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
@@ -64,21 +65,13 @@ cl_zherk_cuda_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zherk(
         clargs->uplo, clargs->trans, clargs->n, clargs->k,
         &(clargs->alpha),
         tileA->mat, tileA->ld,
         &(clargs->beta),
         tileC->mat, tileC->ld,
-        stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index e03e56be3..8026b32fe 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -54,6 +54,7 @@ static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
 #ifdef CHAMELEON_USE_CUDA
 static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_uplo_t uplo;
     int M;
@@ -70,21 +71,13 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zsymm(
         side, uplo,
         M, N,
         &alpha, tileA->mat, tileA->ld,
                 tileB->mat, tileB->ld,
         &beta,  tileC->mat, tileC->ld,
-        stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* CHAMELEON_USE_CUDA */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 5a5c54787..c020938bf 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -51,6 +51,7 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
 #ifdef CHAMELEON_USE_CUDA
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_uplo_t uplo;
     cham_trans_t trans;
     int n;
@@ -67,20 +68,12 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zsyr2k( uplo, trans,
                  n, k,
                  &alpha, tileA->mat, tileA->ld,
                          tileB->mat, tileB->ld,
                  &beta,  tileC->mat, tileC->ld,
-                 stream);
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+                 handle );
 }
 #endif /* CHAMELEON_USE_CUDA */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index ee6f6aa53..1d09960c9 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -57,6 +57,7 @@ cl_zsyrk_cpu_func(void *descr[], void *cl_arg)
 static void
 cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
@@ -64,21 +65,13 @@ cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    RUNTIME_getStream(stream);
-
     CUDA_zsyrk(
         clargs->uplo, clargs->trans, clargs->n, clargs->k,
         (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         (cuDoubleComplex*)&(clargs->beta),
         tileC->mat, tileC->ld,
-        stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c
index 076144bc9..ce1e70b15 100644
--- a/runtime/starpu/codelets/codelet_ztpmlqt.c
+++ b/runtime/starpu/codelets/codelet_ztpmlqt.c
@@ -51,6 +51,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_trans_t trans;
     int M;
@@ -73,19 +74,13 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork );
 
-    RUNTIME_getStream(stream);
-
     CUDA_ztpmlqt(
             side, trans, M, N, K, L, ib,
             tileV->mat, tileV->ld,
             tileT->mat, tileT->ld,
             tileA->mat, tileA->ld,
             tileB->mat, tileB->ld,
-            tileW->mat, lwork, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+            tileW->mat, lwork, handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c
index 66461e6c4..a8870a36f 100644
--- a/runtime/starpu/codelets/codelet_ztpmqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpmqrt.c
@@ -51,6 +51,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_trans_t trans;
     int M;
@@ -73,19 +74,13 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib, &lwork );
 
-    RUNTIME_getStream(stream);
-
     CUDA_ztpmqrt(
             side, trans, M, N, K, L, ib,
             tileV->mat, tileV->ld,
             tileT->mat, tileT->ld,
             tileA->mat, tileA->ld,
             tileB->mat, tileB->ld,
-            tileW->mat, lwork, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+            tileW->mat, lwork, handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index 1c6a2901c..0a34ea60a 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -57,6 +57,7 @@ cl_ztrmm_cpu_func(void *descr[], void *cl_arg)
 static void
 cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
@@ -64,21 +65,13 @@ cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    RUNTIME_getStream(stream);
-
     CUDA_ztrmm(
         clargs->side, clargs->uplo, clargs->transA, clargs->diag,
         clargs->m, clargs->n,
         (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         tileB->mat, tileB->ld,
-        stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 423e95fc9..debe56add 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -59,27 +59,20 @@ static void
 cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
     struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    RUNTIME_getStream(stream);
-
     CUDA_ztrsm(
         clargs->side, clargs->uplo, clargs->transA, clargs->diag,
         clargs->m, clargs->n,
         (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         tileB->mat, tileB->ld,
-        stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-
-    return;
+        handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index e0d55765a..19efea82b 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -59,6 +59,7 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_trans_t trans;
     int m;
@@ -78,18 +79,12 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW );
 
-    RUNTIME_getStream(stream);
-
     CUDA_zunmlqt(
             side, trans, m, n, k, ib,
             tileA->mat, tileA->ld,
             tileT->mat, tileT->ld,
             tileC->mat, tileC->ld,
-            tileW->mat, ldW, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+            tileW->mat, ldW, handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index 58cad3e8a..31489663d 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -58,6 +58,7 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg)
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 {
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
     cham_side_t side;
     cham_trans_t trans;
     int m;
@@ -77,18 +78,12 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &m, &n, &k, &ib, &ldW );
 
-    RUNTIME_getStream(stream);
-
     CUDA_zunmqrt(
             side, trans, m, n, k, ib,
             tileA->mat, tileA->ld,
             tileT->mat, tileT->ld,
             tileC->mat, tileC->ld,
-            tileW->mat, ldW, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
+            tileW->mat, ldW, handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index b3a3e8e70..15391b62d 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -125,14 +125,6 @@ typedef struct starpu_option_request_s {
 
 #endif
 
-/*
- * cuBLAS API - StarPU enable the support for cublas handle
- */
-#if defined(CHAMELEON_USE_CUDA)
-#define RUNTIME_getStream(_stream_)                             \
-    cublasHandle_t _stream_ = starpu_cublas_get_local_handle();
-#endif
-
 /*
  * Enable codelets names
  */
-- 
GitLab