diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index 61a9ae8a5d94685a71ab0e5758e64023f76397c7..0f1c8a8bc277674f7a46528ab43f4559cfab9fdc 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -185,7 +185,7 @@ static void cl_zunmlq_cpu_func(void *descr[], void *cl_arg)
                 A, lda, T, ldt, C, ldc, WORK, ldwork);
 }
 
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
 static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -207,19 +207,24 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
     CUDA_zunmlqt(
             side, trans, m, n, k, ib,
-            A, lda, T, ldt, C, ldc, WORK, ldwork );
+            A, lda, T, ldt, C, ldc, WORK, ldwork, stream );
 
-    cudaThreadSynchronize();
-}
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
 #endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
 CODELETS(zunmlq, 4, cl_zunmlq_cpu_func, cl_zunmlq_cuda_func, 0)
 #else
 CODELETS_CPU(zunmlq, 4, cl_zunmlq_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index e4e278fad8806aba04b55646b0eb1d70b15021b0..300f51d9b5f4fc99e8cd1813098e964585bb71b8 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -211,7 +211,7 @@ static void cl_zunmqr_cpu_func(void *descr[], void *cl_arg)
                 A, lda, T, ldt, C, ldc, WORK, ldwork);
 }
 
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
 static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
 {
     MORSE_starpu_ws_t *d_work;
@@ -233,19 +233,24 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
     CUDA_zunmqrt(
             side, trans, m, n, k, ib,
-            A, lda, T, ldt, C, ldc, WORK, ldwork );
+            A, lda, T, ldt, C, ldc, WORK, ldwork, stream );
 
-    cudaThreadSynchronize();
-}
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
 #endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUDA)
 CODELETS(zunmqr, 4, cl_zunmqr_cpu_func, cl_zunmqr_cuda_func, 0)
 #else
 CODELETS_CPU(zunmqr, 4, cl_zunmqr_cpu_func)