diff --git a/CMakeLists.txt b/CMakeLists.txt
index 55d8bca913e2a784ce9a322231fdfcc7987b1db7..0802349b53656984d5be450c6cfbb1da1665df58 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1003,6 +1003,12 @@ if( CHAMELEON_SCHED_QUARK )
 
 endif()
 
+# Add option to exploit cublas API v2
+# -----------------------------------
+cmake_dependent_option(CHAMELEON_USE_CUBLAS_V2
+  "Enable cublas API v2" ON
+  "CHAMELEON_USE_CUDA;CHAMELEON_SCHED_STARPU" OFF)
+
 list(REMOVE_DUPLICATES CMAKE_EXE_LINKER_FLAGS)
 string(REPLACE ";" " " CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS}")
 # Fix a problem on Mac OS X when building shared libraries
diff --git a/control/common.h b/control/common.h
index f08f2712c91a98fe18790d40331168d06b325f71..ab1a98867bfed1be80fd09e0db14969773b93e6b 100644
--- a/control/common.h
+++ b/control/common.h
@@ -54,6 +54,7 @@
 #include <cuda_runtime.h>
 #include <cuda_runtime_api.h>
 #if defined(CHAMELEON_USE_CUBLAS_V2)
+#include <cublas.h>
 #include <cublas_v2.h>
 #else
 #include <cublas.h>
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 06d25bd476f7d7f5286e40eefd583cee53d2de2a..39a9d20be5bb4f73d6892417af809b2db0f9be90 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -65,12 +65,20 @@ if( CHAMELEON_USE_MAGMA )
     )
 endif()
 
-precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}"
-                    PRECISIONS "${CHAMELEON_PRECISION}")
+precisions_rules_py(
+  CUDABLAS_SRCS_GENERATED "${ZSRC}"
+  PRECISIONS "${CHAMELEON_PRECISION}")
 
 set(CUDABLAS_SRCS
-    ${CUDABLAS_SRCS_GENERATED}
+  ${CUDABLAS_SRCS_GENERATED}
+  )
+
+if (CHAMELEON_USE_CUBLAS_V2)
+  set(CUDABLAS_SRCS
+    ${CUDABLAS_SRCS}
+    cudaglobal.c
     )
+endif (CHAMELEON_USE_CUBLAS_V2)
 
 # Compile step
 # ------------
diff --git a/cudablas/compute/cuda_zgemm.c b/cudablas/compute/cuda_zgemm.c
index 0c6a61b4f43a214eeaa83a3b2f4c356c22207adb..c5d0503b531443ab37b7dbeb127b3af672ec0250 100644
--- a/cudablas/compute/cuda_zgemm.c
+++ b/cudablas/compute/cuda_zgemm.c
@@ -34,18 +34,13 @@ int CUDA_zgemm(MORSE_enum transa, MORSE_enum transb,
                cuDoubleComplex *C, int ldc,
                CUBLAS_STREAM_PARAM)
 {
-
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
     cublasZgemm(CUBLAS_HANDLE
-                morse_lapack_const(transa), morse_lapack_const(transb),
+                morse_cublas_const(transa), morse_cublas_const(transb),
                 m, n, k,
                 CUBLAS_VALUE(alpha), A, lda,
                                      B, ldb,
                 CUBLAS_VALUE(beta),  C, ldc);
 
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
-
     return MORSE_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_zhemm.c b/cudablas/compute/cuda_zhemm.c
index 8ed742362c32c0c6d0e2891510c333efb7cf431d..5cb93159e26ddbe5ddc3fb2608f1436b7dcf9f79 100644
--- a/cudablas/compute/cuda_zhemm.c
+++ b/cudablas/compute/cuda_zhemm.c
@@ -34,18 +34,13 @@ int CUDA_zhemm(MORSE_enum side, MORSE_enum uplo,
                cuDoubleComplex *C, int ldc,
                CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZhemm(CUBLAS_HANDLE
-                morse_lapack_const(side), morse_lapack_const(uplo),
+                morse_cublas_const(side), morse_cublas_const(uplo),
                 m, n,
                 CUBLAS_VALUE(alpha), A, lda,
                                      B, ldb,
                 CUBLAS_VALUE(beta),  C, ldc);
 
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
-
     return MORSE_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_zher2k.c b/cudablas/compute/cuda_zher2k.c
index 5b340abd58953d91f2d8a60965ab935173207972..206d0d50dc8e13dbf6aa55b528808088d057a11a 100644
--- a/cudablas/compute/cuda_zher2k.c
+++ b/cudablas/compute/cuda_zher2k.c
@@ -34,12 +34,8 @@ int CUDA_zher2k(MORSE_enum uplo, MORSE_enum trans,
                 cuDoubleComplex *C, int ldc,
                 CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZher2k(CUBLAS_HANDLE
-                 morse_lapack_const(uplo), morse_lapack_const(trans),
+                 morse_cublas_const(uplo), morse_cublas_const(trans),
                  n, k,
                  CUBLAS_VALUE(alpha), A, lda,
                                       B, ldb,
diff --git a/cudablas/compute/cuda_zherk.c b/cudablas/compute/cuda_zherk.c
index 39717408795f1dbd70076d4ccf81b7867888489f..f5eec6d1721efefdce0f5a8ce974887b00c9b1c6 100644
--- a/cudablas/compute/cuda_zherk.c
+++ b/cudablas/compute/cuda_zherk.c
@@ -33,15 +33,11 @@ int CUDA_zherk( MORSE_enum uplo, MORSE_enum trans,
                 cuDoubleComplex *B, int ldb,
                 CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
-    cublasZherk(
-        morse_lapack_const(uplo), morse_lapack_const(trans),
-        n, k,
-        *alpha, A, lda,
-        *beta,  B, ldb);
+    cublasZherk( CUBLAS_HANDLE
+                 morse_cublas_const(uplo), morse_cublas_const(trans),
+                 n, k,
+                 CUBLAS_VALUE(alpha), A, lda,
+                 CUBLAS_VALUE(beta),  B, ldb);
 
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
 
diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c
index c35b555861e140400f223dc7a5d460a75286def8..0faee149e21d22aa9d4092da6aa5485997c5e0a1 100644
--- a/cudablas/compute/cuda_zlarfb.c
+++ b/cudablas/compute/cuda_zlarfb.c
@@ -49,10 +49,6 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
 
     MORSE_enum transT, uplo, notransV, transV;
 
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     /* Check input arguments */
     if ((side != MorseLeft) && (side != MorseRight)) {
         return -1;
@@ -107,23 +103,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
 
         // W = C^H V
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseConjTrans), morse_lapack_const(notransV),
+                     morse_cublas_const(MorseConjTrans), morse_cublas_const(notransV),
                      N, K, M,
                      CUBLAS_SADDR(zone),  C, LDC,
                                           V, LDV,
                      CUBLAS_SADDR(zzero), WORK, LDWORK );
 
         // W = W T^H = C^H V T^H
-        cublasZtrmm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseRight), morse_lapack_const(uplo),
-                     morse_lapack_const(transT), morse_lapack_const(MorseNonUnit),
-                     N, K, CUBLAS_SADDR(zone),
-                     T,    LDT,
-                     WORK, LDWORK);
+        CUDA_ztrmm( MorseRight, uplo, transT, MorseNonUnit,
+                    N, K,
+                    CUBLAS_SADDR(zone), T,    LDT,
+                                        WORK, LDWORK,
+                    CUBLAS_STREAM_VALUE );
 
         // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(notransV), morse_lapack_const(MorseConjTrans),
+                     morse_cublas_const(notransV), morse_cublas_const(MorseConjTrans),
                      M, N, K,
                      CUBLAS_SADDR(mzone), V,    LDV,
                                           WORK, LDWORK,
@@ -135,23 +130,22 @@ CUDA_zlarfb(MORSE_enum side, MORSE_enum trans,
 
         // W = C V
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseNoTrans), morse_lapack_const(notransV),
+                     morse_cublas_const(MorseNoTrans), morse_cublas_const(notransV),
                      M, K, N,
                      CUBLAS_SADDR(zone),  C, LDC,
                                           V, LDV,
                      CUBLAS_SADDR(zzero), WORK, LDWORK );
 
         // W = W T = C V T
-        cublasZtrmm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseRight), morse_lapack_const(uplo),
-                     morse_lapack_const(trans), morse_lapack_const(MorseNonUnit),
-                     M, K, CUBLAS_SADDR(zone),
-                     T,    LDT,
-                     WORK, LDWORK);
+        CUDA_ztrmm( MorseRight, uplo, trans, MorseNonUnit,
+                    M, K,
+                    CUBLAS_SADDR(zone), T,    LDT,
+                                        WORK, LDWORK,
+                    CUBLAS_STREAM_VALUE );
 
         // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
         cublasZgemm( CUBLAS_HANDLE
-                     morse_lapack_const(MorseNoTrans), morse_lapack_const(transV),
+                     morse_cublas_const(MorseNoTrans), morse_cublas_const(transV),
                      M, N, K,
                      CUBLAS_SADDR(mzone), WORK, LDWORK,
                                           V,    LDV,
diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c
index edca0c8d7e13b14995ac2c74857cdfa5106e38ca..e3e29f9569d37451b8741861a238315ed122115b 100644
--- a/cudablas/compute/cuda_zparfb.c
+++ b/cudablas/compute/cuda_zparfb.c
@@ -243,7 +243,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
             transA2 = storev == MorseColumnwise ? MorseNoTrans : MorseConjTrans;
 
             cublasZgemm(CUBLAS_HANDLE
-                        morse_lapack_const(transW), morse_lapack_const(MorseNoTrans),
+                        morse_cublas_const(transW), morse_cublas_const(MorseNoTrans),
                         K, N1, M2,
                         CUBLAS_SADDR(zone),
                         V     /* K*M2  */, LDV,
@@ -253,14 +253,11 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
             if (WORKC == NULL) {
                 /* W = op(T) * W */
-                cublasZtrmm( CUBLAS_HANDLE
-                             morse_lapack_const(MorseLeft), morse_lapack_const(MorseUpper),
-                             morse_lapack_const(trans), morse_lapack_const(MorseNonUnit),
-                             K, N2,
-                             CUBLAS_SADDR(zone),
-                             T,    LDT,
-                             WORK, LDWORK);
-
+                CUDA_ztrmm( MorseLeft, MorseUpper, trans, MorseNonUnit,
+                            K, N2,
+                            CUBLAS_SADDR(zone), T,    LDT,
+                                                WORK, LDWORK,
+                            CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - op(T) * W */
                 for(j = 0; j < N1; j++) {
@@ -272,7 +269,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* A2 = A2 - op(V) * W  */
                 cublasZgemm(CUBLAS_HANDLE
-                            morse_lapack_const(transA2), morse_lapack_const(MorseNoTrans),
+                            morse_cublas_const(transA2), morse_cublas_const(MorseNoTrans),
                             M2, N2, K,
                             CUBLAS_SADDR(mzone), V    /* M2*K  */, LDV,
                                                  WORK /* K*N2  */, LDWORK,
@@ -281,7 +278,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
             } else {
                 /* Wc = V * op(T) */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(transA2), morse_lapack_const(trans),
+                             morse_cublas_const(transA2), morse_cublas_const(trans),
                              M2, K, K,
                              CUBLAS_SADDR(zone),  V, LDV,
                                                   T, LDT,
@@ -289,7 +286,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* A1 = A1 - opt(T) * W */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(trans), morse_lapack_const(MorseNoTrans),
+                             morse_cublas_const(trans), morse_cublas_const(MorseNoTrans),
                              K, N1, K,
                              CUBLAS_SADDR(mzone), T,    LDT,
                                                   WORK, LDWORK,
@@ -297,7 +294,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* A2 = A2 - Wc * W */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(MorseNoTrans), morse_lapack_const(MorseNoTrans),
+                             morse_cublas_const(MorseNoTrans), morse_cublas_const(MorseNoTrans),
                              M2, N2, K,
                              CUBLAS_SADDR(mzone), WORKC, LDWORKC,
                                                   WORK,  LDWORK,
@@ -328,7 +325,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
             transA2 = storev == MorseColumnwise ? MorseConjTrans : MorseNoTrans;
 
             cublasZgemm(CUBLAS_HANDLE
-                        morse_lapack_const(MorseNoTrans), morse_lapack_const(transW),
+                        morse_cublas_const(MorseNoTrans), morse_cublas_const(transW),
                         M1, K, N2,
                         CUBLAS_SADDR(zone), A2   /* M1*N2 */, LDA2,
                                             V    /* N2*K  */, LDV,
@@ -336,14 +333,11 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
             if (WORKC == NULL) {
                 /* W = W * op(T) */
-                cublasZtrmm( CUBLAS_HANDLE
-                             morse_lapack_const(MorseRight), morse_lapack_const(MorseUpper),
-                             morse_lapack_const(trans), morse_lapack_const(MorseNonUnit),
-                             M2, K,
-                             CUBLAS_SADDR(zone),
-                             T,    LDT,
-                             WORK, LDWORK);
-
+                CUDA_ztrmm( MorseRight, MorseUpper, trans, MorseNonUnit,
+                            M2, K,
+                            CUBLAS_SADDR(zone), T,    LDT,
+                                                WORK, LDWORK,
+                            CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - W * op(T) */
                 for(j = 0; j < K; j++) {
@@ -355,7 +349,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* A2 = A2 - W * op(V)  */
                 cublasZgemm(CUBLAS_HANDLE
-                            morse_lapack_const(MorseNoTrans), morse_lapack_const(transA2),
+                            morse_cublas_const(MorseNoTrans), morse_cublas_const(transA2),
                             M2, N2, K,
                             CUBLAS_SADDR(mzone), WORK /* M2*K  */, LDWORK,
                                                  V    /* K*N2  */, LDV,
@@ -364,7 +358,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
             } else {
                 /* A1 = A1 - W * opt(T) */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(MorseNoTrans), morse_lapack_const(trans),
+                             morse_cublas_const(MorseNoTrans), morse_cublas_const(trans),
                              M1, K, K,
                              CUBLAS_SADDR(mzone), WORK, LDWORK,
                                                   T,    LDT,
@@ -372,7 +366,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* Wc = op(T) * V */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(trans), morse_lapack_const(transA2),
+                             morse_cublas_const(trans), morse_cublas_const(transA2),
                              K, N2, K,
                              CUBLAS_SADDR(zone),  T,     LDT,
                                                   V,     LDV,
@@ -380,7 +374,7 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
 
                 /* A2 = A2 - W * Wc */
                 cublasZgemm( CUBLAS_HANDLE
-                             morse_lapack_const(MorseNoTrans), morse_lapack_const(MorseNoTrans),
+                             morse_cublas_const(MorseNoTrans), morse_cublas_const(MorseNoTrans),
                              M2, N2, K,
                              CUBLAS_SADDR(mzone), WORK,  LDWORK,
                                                   WORKC, LDWORKC,
diff --git a/cudablas/compute/cuda_zsymm.c b/cudablas/compute/cuda_zsymm.c
index 93ec3f386942d33cee4250721be2a794679e581a..5213565edb79207844345170f9de2cb8549a7d1b 100644
--- a/cudablas/compute/cuda_zsymm.c
+++ b/cudablas/compute/cuda_zsymm.c
@@ -34,12 +34,8 @@ int CUDA_zsymm(MORSE_enum side, MORSE_enum uplo,
                cuDoubleComplex *C, int ldc,
                CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZsymm(CUBLAS_HANDLE
-                morse_lapack_const(side), morse_lapack_const(uplo),
+                morse_cublas_const(side), morse_cublas_const(uplo),
                 m, n,
                 CUBLAS_VALUE(alpha), A, lda,
                                      B, ldb,
diff --git a/cudablas/compute/cuda_zsyr2k.c b/cudablas/compute/cuda_zsyr2k.c
index 839ea2d68774c3a24f746048f2d9737d53640b85..c80babbad1e968f721881b630a18515ce25de51a 100644
--- a/cudablas/compute/cuda_zsyr2k.c
+++ b/cudablas/compute/cuda_zsyr2k.c
@@ -35,12 +35,8 @@ int CUDA_zsyr2k(
         cuDoubleComplex *C, int ldc,
         CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZsyr2k(CUBLAS_HANDLE
-                 morse_lapack_const(uplo), morse_lapack_const(trans),
+                 morse_cublas_const(uplo), morse_cublas_const(trans),
                  n, k,
                  CUBLAS_VALUE(alpha), A, lda,
                                       B, ldb,
diff --git a/cudablas/compute/cuda_zsyrk.c b/cudablas/compute/cuda_zsyrk.c
index e988fbc41d9f56abd0cc73f8d42190fdea81bef0..11d18ea1556b15b3511ad20b50451c5cd76c06d3 100644
--- a/cudablas/compute/cuda_zsyrk.c
+++ b/cudablas/compute/cuda_zsyrk.c
@@ -33,12 +33,8 @@ int CUDA_zsyrk(MORSE_enum uplo, MORSE_enum trans,
                cuDoubleComplex *C, int ldc,
                CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZsyrk(CUBLAS_HANDLE
-                morse_lapack_const(uplo), morse_lapack_const(trans),
+                morse_cublas_const(uplo), morse_cublas_const(trans),
                 n, k,
                 CUBLAS_VALUE(alpha), A, lda,
                 CUBLAS_VALUE(beta),  C, ldc);
diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c
index d86fa5267fe483ab5517d393aa93913f1bf7d512..349a6f98a02597c81b7dbec7cf395b3cd8498ffc 100644
--- a/cudablas/compute/cuda_ztrmm.c
+++ b/cudablas/compute/cuda_ztrmm.c
@@ -34,17 +34,29 @@ int CUDA_ztrmm(
         cuDoubleComplex *B, int ldb,
         CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
 
-    cublasZtrmm(CUBLAS_HANDLE
-        morse_lapack_const(side), morse_lapack_const(uplo),
-        morse_lapack_const(transa), morse_lapack_const(diag),
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+
+    cublasZtrmm(
+        CUBLAS_HANDLE
+        morse_cublas_const(side), morse_cublas_const(uplo),
+        morse_cublas_const(transa), morse_cublas_const(diag),
         m, n,
         CUBLAS_VALUE(alpha), A, lda,
+        B, ldb,
         B, ldb);
 
+#else
+
+    cublasZtrmm(
+        CUBLAS_HANDLE
+        morse_cublas_const(side), morse_cublas_const(uplo),
+        morse_cublas_const(transa), morse_cublas_const(diag),
+        m, n,
+        CUBLAS_VALUE(alpha), A, lda,
+                             B, ldb);
+#endif
+
     assert( CUBLAS_STATUS_SUCCESS == cublasGetError() );
 
     return MORSE_SUCCESS;
diff --git a/cudablas/compute/cuda_ztrsm.c b/cudablas/compute/cuda_ztrsm.c
index d82766bc8e7710a3eb65aac2e2ae57e3427bf9a6..ae485b751b79ea71b04fbe9ce34c3aab08d97b36 100644
--- a/cudablas/compute/cuda_ztrsm.c
+++ b/cudablas/compute/cuda_ztrsm.c
@@ -33,13 +33,9 @@ int CUDA_ztrsm(MORSE_enum side, MORSE_enum uplo,
                cuDoubleComplex *B, int ldb,
                CUBLAS_STREAM_PARAM)
 {
-#if !defined(CHAMELEON_USE_CUBLAS_V2)
-    cublasSetKernelStream( stream );
-#endif
-
     cublasZtrsm(CUBLAS_HANDLE
-        morse_lapack_const(side), morse_lapack_const(uplo),
-        morse_lapack_const(transa), morse_lapack_const(diag),
+        morse_cublas_const(side), morse_cublas_const(uplo),
+        morse_cublas_const(transa), morse_cublas_const(diag),
         m, n,
         CUBLAS_VALUE(alpha), A, lda,
         B, ldb);
diff --git a/cudablas/compute/cudaglobal.c b/cudablas/compute/cudaglobal.c
new file mode 100644
index 0000000000000000000000000000000000000000..597501e9e45318c282226525dc2367afece94873
--- /dev/null
+++ b/cudablas/compute/cudaglobal.c
@@ -0,0 +1,127 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University of
+ *                          Tennessee Research Foundation.  All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+/**
+ *
+ * @file cudaglobal.c
+ *
+ *  MORSE auxiliary routines
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2017-04-06
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+/*******************************************************************************
+ *  LAPACK Constants
+ **/
+int morse_cublas_constants[] =
+{
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 100
+    0,                      // 101: MorseRowMajor
+    0,                      // 102: MorseColMajor
+    0, 0, 0, 0, 0, 0, 0, 0,
+    CUBLAS_OP_N,            // 111: MorseNoTrans
+    CUBLAS_OP_T,            // 112: MorseTrans
+    CUBLAS_OP_C,            // 113: MorseConjTrans
+    0, 0, 0, 0, 0, 0, 0,
+    CUBLAS_FILL_MODE_UPPER, // 121: MorseUpper
+    CUBLAS_FILL_MODE_LOWER, // 122: MorseLower
+    0,                      // 123: MorseUpperLower
+    0, 0, 0, 0, 0, 0, 0,
+    CUBLAS_DIAG_NON_UNIT,   // 131: MorseNonUnit
+    CUBLAS_DIAG_UNIT,       // 132: MorseUnit
+    0, 0, 0, 0, 0, 0, 0, 0,
+    CUBLAS_SIDE_LEFT,       // 141: MorseLeft
+    CUBLAS_SIDE_RIGHT,      // 142: MorseRight
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 151:
+    0,                      // 152:
+    0,                      // 153:
+    0,                      // 154:
+    0,                      // 155:
+    0,                      // 156:
+    0,                      // 157: MorseEps
+    0,                      // 158:
+    0,                      // 159:
+    0,                      // 160:
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 171: MorseOneNorm
+    0,                      // 172: MorseRealOneNorm
+    0,                      // 173: MorseTwoNorm
+    0,                      // 174: MorseFrobeniusNorm
+    0,                      // 175: MorseInfNorm
+    0,                      // 176: MorseRealInfNorm
+    0,                      // 177: MorseMaxNorm
+    0,                      // 178: MorseRealMaxNorm
+    0,                      // 179
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 200
+    0,                      // 201: MorseDistUniform
+    0,                      // 202: MorseDistSymmetric
+    0,                      // 203: MorseDistNormal
+    0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 240
+    0,                      // 241 MorseHermGeev
+    0,                      // 242 MorseHermPoev
+    0,                      // 243 MorseNonsymPosv
+    0,                      // 244 MorseSymPosv
+    0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 290
+    0,                      // 291 MorseNoPacking
+    0,                      // 292 MorsePackSubdiag
+    0,                      // 293 MorsePackSupdiag
+    0,                      // 294 MorsePackColumn
+    0,                      // 295 MorsePackRow
+    0,                      // 296 MorsePackLowerBand
+    0,                      // 297 MorsePackUpeprBand
+    0,                      // 298 MorsePackAll
+    0,                      // 299
+    0,                      // 300
+    0,                      // 301 MorseNoVec
+    0,                      // 302 MorseVec
+    0,                      // 303 MorseIvec
+    0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 390
+    0,                      // 391
+    0,                      // 392
+    0, 0, 0, 0, 0, 0, 0, 0,
+    0,                      // 401
+    0,                      // 402
+    0, 0, 0, 0, 0, 0, 0, 0  // Remember to add a coma!
+};
diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h
index d6c2004fe9bfa69e0b7ddb1fe9accb2f23f521e1..b181fc8332a36c1024f80c97d175e29f4ad1e0ae 100644
--- a/cudablas/include/cudablas.h
+++ b/cudablas/include/cudablas.h
@@ -41,7 +41,9 @@
 
 #if defined(CHAMELEON_USE_CUBLAS_V2)
 
+#include <cublas.h>
 #include <cublas_v2.h>
+
 #define CUBLAS_STREAM_PARAM cublasHandle_t handle
 #define CUBLAS_STREAM_VALUE handle
 #define CUBLAS_HANDLE handle,
@@ -96,4 +98,12 @@
 extern char *morse_lapack_constants[];
 #define morse_lapack_const(morse_const) morse_lapack_constants[morse_const][0]
 
+extern int morse_cublas_constants[];
+
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+#define morse_cublas_const(morse_const) morse_cublas_constants[morse_const]
+#else
+#define morse_cublas_const(morse_const) morse_lapack_constants[morse_const][0]
+#endif
+
 #endif
diff --git a/runtime/starpu/codelets/codelet_zgelqt.c b/runtime/starpu/codelets/codelet_zgelqt.c
index af571f2ae79f096aa8efc680c16957b334fa8c48..22355977823a4df1aa73e905cc7db4d158025d1f 100644
--- a/runtime/starpu/codelets/codelet_zgelqt.c
+++ b/runtime/starpu/codelets/codelet_zgelqt.c
@@ -166,7 +166,6 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
     cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
     int lda, ldt;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
 
@@ -186,15 +185,14 @@ static void cl_zgelqt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + chameleon_max(m,n);
     h_D   = h_W   + ib*ib;
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_zgelqt(
             m, n, ib,
             d_A, lda, h_A, ib,
             d_T, ldt, h_T, ib,
             d_D, h_D, ib, h_TAU,
-            h_W, d_W, stream);
+            h_W, d_W, stream );
 
     cudaThreadSynchronize();
 }
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 292e071de10a48b70fb34fa6e91ad2dfdc3c7de4..90fe880a741d756713ef0083fd4a73a83f1e225f 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -148,14 +148,13 @@ static void cl_zgemm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &transA, &transB, &m, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream( stream );
 
     CUDA_zgemm(
         transA, transB,
diff --git a/runtime/starpu/codelets/codelet_zgeqrt.c b/runtime/starpu/codelets/codelet_zgeqrt.c
index 50769b3be7323b6342f68889c1138d6d4950d1b2..868c10c16064001632a036ec7b5a7791f6fbb0fa 100644
--- a/runtime/starpu/codelets/codelet_zgeqrt.c
+++ b/runtime/starpu/codelets/codelet_zgeqrt.c
@@ -166,7 +166,6 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A, *h_T, *h_D, *h_W, *h_TAU;
     cuDoubleComplex *d_A, *d_T, *d_D, *d_W;
     int lda, ldt;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda, &ldt, &h_work);
 
@@ -186,8 +185,7 @@ static void cl_zgeqrt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + chameleon_max(m,n);
     h_D   = h_W   + ib*ib;
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_zgeqrt(
             m, n, ib,
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index df45234630fbd77f77f7e976eea77cba868c305b..97f0adcc5b3fbf1c275e5feded3fa258b77ed12f 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -119,14 +119,13 @@ static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int LDC;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zhemm(
         side, uplo,
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 9b6b7194db42d17a7fb5eb755efc76ec7c79c09b..7c5509240eb3893fab6e471a2d7beed51159f41c 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -116,14 +116,13 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
     double beta;
     cuDoubleComplex *C;
     int ldc;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zher2k( uplo, trans,
                  n, k, &alpha, A, lda, B, ldb, &beta, C, ldc,
diff --git a/runtime/starpu/codelets/codelet_zherfb.c b/runtime/starpu/codelets/codelet_zherfb.c
index b5ceec04b8ded9bf733c534645ae7a5303294b5b..8b6c5d8d37e80d64da4d526554faaf2d0e67d4be 100644
--- a/runtime/starpu/codelets/codelet_zherfb.c
+++ b/runtime/starpu/codelets/codelet_zherfb.c
@@ -113,9 +113,8 @@ static void cl_zherfb_cuda_func(void *descr[], void *cl_arg)
     int ldc;
     cuDoubleComplex *WORK;
     int ldwork;
-    CUstream stream;
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     A    = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     T    = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index e5ef780218f5cf95f13f6956b971350d38268b8f..0576cef52ac2abcca50dbfc36ed5422746f4c5a1 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -108,13 +108,12 @@ static void cl_zherk_cuda_func(void *descr[], void *cl_arg)
     double beta;
     cuDoubleComplex *C;
     int ldc;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zherk(
         uplo, trans,
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 71e6b86700dde96065d72c0a0c3e43edac347f2c..bb01bfc7bac752da147983e5ac3a7640836d1925 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -119,14 +119,13 @@ static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int LDC;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &LDA, &LDB, &beta, &LDC);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zsymm(
         side, uplo,
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index cafe3887e071c926e91f296b79d97210eaf71c15..aec9d75f5a5f43dc087dfdcbb684cc5b68216b12 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -116,14 +116,13 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &ldb, &beta, &ldc);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zsyr2k( uplo, trans,
                  n, k, &alpha, A, lda, B, ldb, &beta, C, ldc,
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index b368779f50cc7af2dcb8644be56720746a4deb37..265a8922d43779fc8897c4a26e48f6ac3d034ea9 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -109,13 +109,12 @@ static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex beta;
     cuDoubleComplex *C;
     int ldc;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     C = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &lda, &beta, &ldc);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_zsyrk(
         uplo, trans,
diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c
index 5428c9eea8badfdc586898350e80af94497a2f6a..c1344760444deb6f292017ed0252eb0c7d9188df 100644
--- a/runtime/starpu/codelets/codelet_ztpmqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpmqrt.c
@@ -126,7 +126,6 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *B;
     int ldb;
     cuDoubleComplex *W;
-    CUstream stream;
 
     V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -137,7 +136,7 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
                                 &ldv, &ldt, &lda, &ldb );
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_ztpmqrt(
             side, trans, M, N, K, L, ib,
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index ca54848703d4778361831948aaebfff235795779..b002c81709b0811d838670c96d38fbdb94086897 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -113,13 +113,12 @@ static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
     int LDA;
     cuDoubleComplex *B;
     int LDB;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &M, &N, &alpha, &LDA, &LDB);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_ztrmm(
         side, uplo,
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 76903283298f39b36ee4bcfea4263b563a5434ca..288427e82cb5c65809ed9fdfd6bbe48de7adba01 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -135,13 +135,12 @@ static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
     int lda;
     cuDoubleComplex *B;
     int ldb;
-    CUstream stream;
 
     A = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &side, &uplo, &transA, &diag, &m, &n, &alpha, &lda, &ldb);
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
 
     CUDA_ztrsm(
         side, uplo, transA, diag,
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
index 3780308276014e3c7596a79b65d1e32e01df2466..101feebfe80cede6ddb1f759a5883da3a495ce79 100644
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ b/runtime/starpu/codelets/codelet_ztslqt.c
@@ -185,7 +185,6 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
     cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
     int lda1, lda2, ldt;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
 
@@ -204,7 +203,7 @@ static void cl_ztslqt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + chameleon_max(m,n);
     h_D   = h_W   + ib*m;
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
     CUDA_ztslqt(
             m, n, ib,
             d_A1, lda1, d_A2, lda2,
diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c
index 3901295404dcd256073c9e34fa72bcbd2d4dfce0..1ec507ca2f66d682624e9b69a83dfcbb105d0f84 100644
--- a/runtime/starpu/codelets/codelet_ztsmlq.c
+++ b/runtime/starpu/codelets/codelet_ztsmlq.c
@@ -239,7 +239,6 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *W, *WC;
     int ldwork;
     int ldworkc;
-    CUstream stream;
 
     A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -253,8 +252,7 @@ static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
     WC = W + ib * ldwork;
     ldworkc = (side == MorseLeft) ? m1 : ib;
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib,
                       A1, lda1, A2, lda2, V, ldv, T, ldt,
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index ed4f7dbe9303d36cef5385dd33cddb54b9583fd4..c1c177c9279e3e3c71c827fc3601da98663cc264 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -270,7 +270,6 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *W, *WC;
     int ldwork;
     int ldworkc;
-    CUstream stream;
 
     A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -284,8 +283,7 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
     WC = W + ib * (side == MorseLeft ? m1 : n1);
     ldworkc = (side == MorseLeft) ? m2 : ib;
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_ztsmqr(
             side, trans, m1, n1, m2, n2, k, ib,
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
index 880e1ff8a5acf3c455a2d0a4835d092205404e38..4c5d03fe2b4eeb60fa30dd14f9a250f3bfae080c 100644
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ b/runtime/starpu/codelets/codelet_ztsqrt.c
@@ -176,7 +176,6 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *h_A2, *h_T, *h_D, *h_TAU, *h_W;
     cuDoubleComplex *d_A1, *d_A2, *d_T, *d_D, *d_W;
     int lda1, lda2, ldt;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
 
@@ -195,7 +194,7 @@ static void cl_ztsqrt_cuda_func(void *descr[], void *cl_arg)
     h_W   = h_TAU + chameleon_max(m,n);
     h_D   = h_W   + ib*n;
 
-    stream = starpu_cuda_get_local_stream();
+    RUNTIME_getStream(stream);
     CUDA_ztsqrt(
             m, n, ib,
             d_A1, lda1, d_A2, lda2,
diff --git a/runtime/starpu/codelets/codelet_zunmlq.c b/runtime/starpu/codelets/codelet_zunmlq.c
index 6525661f635f020ec7c234e456eeca32ecbd177c..b990da74beadad4b29ab49c6acfc1f4dd0cd413d 100644
--- a/runtime/starpu/codelets/codelet_zunmlq.c
+++ b/runtime/starpu/codelets/codelet_zunmlq.c
@@ -201,7 +201,6 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
     const cuDoubleComplex *A, *T;
     cuDoubleComplex *C, *WORK;
     int lda, ldt, ldc, ldwork;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib,
                                &lda, &ldt, &ldc, &ldwork);
@@ -211,8 +210,7 @@ static void cl_zunmlq_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_zunmlqt(
             side, trans, m, n, k, ib,
diff --git a/runtime/starpu/codelets/codelet_zunmqr.c b/runtime/starpu/codelets/codelet_zunmqr.c
index 2b88c1d3b90206b3acd2cc1a238a0ca578c81beb..d4c6d6d813ae9d948d232908da530c1c963acd3e 100644
--- a/runtime/starpu/codelets/codelet_zunmqr.c
+++ b/runtime/starpu/codelets/codelet_zunmqr.c
@@ -227,7 +227,6 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
     const cuDoubleComplex *A, *T;
     cuDoubleComplex *C, *WORK;
     int lda, ldt, ldc, ldwork;
-    CUstream stream;
 
     starpu_codelet_unpack_args(cl_arg, &side, &trans, &m, &n, &k, &ib,
                                &lda, &ldt, &ldc, &ldwork);
@@ -237,8 +236,7 @@ static void cl_zunmqr_cuda_func(void *descr[], void *cl_arg)
     C    = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     WORK = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
 
-    stream = starpu_cuda_get_local_stream();
-    cublasSetKernelStream( stream );
+    RUNTIME_getStream(stream);
 
     CUDA_zunmqrt(
             side, trans, m, n, k, ib,
diff --git a/runtime/starpu/include/morse_starpu.h b/runtime/starpu/include/morse_starpu.h
index c1980d3b615f7ef49d51d2797195c7851b2471a7..f0649aeaf585e3f45fc4f5a548b8ab0a5d5c8401 100644
--- a/runtime/starpu/include/morse_starpu.h
+++ b/runtime/starpu/include/morse_starpu.h
@@ -45,6 +45,13 @@
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
 #include <starpu_scheduler.h>
 #include <starpu_cuda.h>
+
+#include <cublas.h>
+#include <starpu_cublas.h>
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+#include <cublas_v2.h>
+#include <starpu_cublas_v2.h>
+#endif
 #endif
 
 #include "control/common.h"
@@ -72,6 +79,19 @@ typedef struct starpu_conf starpu_conf_t;
 
 #endif
 
+/*
+ * cuBlasAPI v2 - StarPU enable the support for cublas handle
+ */
+#if defined(CHAMELEON_USE_CUDA) && defined(CHAMELEON_USE_CUBLAS_V2)
+#define RUNTIME_getStream(_stream_)                             \
+    cublasHandle_t _stream_ = starpu_cublas_get_local_handle();
+#else
+#define RUNTIME_getStream(_stream_)                             \
+    cudaStream_t _stream_ = starpu_cuda_get_local_stream();     \
+    cublasSetKernelStream( stream );
+
+#endif
+
 /*
  * Enable codelets names
  */