diff --git a/coreblas/compute/core_ztsmqr.c b/coreblas/compute/core_ztsmqr.c
index 0792655eefe1ba2dc8ca986d7a36d17bed8d16b3..10d4f2d03a98a135c8062c0618996572d8489829 100644
--- a/coreblas/compute/core_ztsmqr.c
+++ b/coreblas/compute/core_ztsmqr.c
@@ -254,10 +254,10 @@ int CORE_ztsmqr(MORSE_enum side, MORSE_enum trans,
         CORE_zparfb(
             side, trans, MorseForward, MorseColumnwise,
             mi, ni, M2, N2, kb, 0,
-            &A1[LDA1*jc+ic], LDA1,
+            A1 + LDA1*jc+ic, LDA1,
             A2, LDA2,
-            &V[LDV*i], LDV,
-            &T[LDT*i], LDT,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
             WORK, LDWORK);
     }
     return MORSE_SUCCESS;
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 7ef91aee08ef11362bc8115316fb7a69a4bd56d8..d05705eb7fa4a50aeed8bb7e00b707e89e7bf5ae 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -32,6 +32,7 @@ set(ZSRC
     cuda_zhemm.c
     cuda_zher2k.c
     cuda_zherk.c
+    cuda_zlarfb.c
     cuda_zparfb.c
     cuda_zsymm.c
     cuda_zsyr2k.c
@@ -40,6 +41,8 @@ set(ZSRC
     cuda_ztrsm.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    cuda_zunmlqt.c
+    cuda_zunmqrt.c
     )
 
 if( CHAMELEON_USE_MAGMA )
@@ -50,15 +53,12 @@ if( CHAMELEON_USE_MAGMA )
     cuda_zgessm.c
     cuda_zgetrf.c
     cuda_zlauum.c
-    cuda_zparfb.c
     cuda_zpotrf.c
     cuda_zssssm.c
     cuda_ztrtri.c
     cuda_ztslqt.c
     cuda_ztsqrt.c
     cuda_ztstrf.c
-    cuda_zunmlqt.c
-    cuda_zunmqrt.c
     )
 endif()
 
diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c
index a234d2f2178f2edea85f413e08db80bb0ec24c1a..e104a085fcccd05a3f4b6746ec98f6713f97025d 100644
--- a/cudablas/compute/cuda_ztsmqr.c
+++ b/cudablas/compute/cuda_ztsmqr.c
@@ -132,7 +132,6 @@ int CUDA_ztsmqr(
             ni = N1 - i;
             jc = i;
         }
-
         /*
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c
index 55ae68ed671dc94bf7b156146c19e04b090a9529..c9a15473376f72c6d9a0dcee40b2da06e29adf5e 100644
--- a/cudablas/compute/cuda_zunmlqt.c
+++ b/cudablas/compute/cuda_zunmlqt.c
@@ -24,14 +24,14 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
-int CUDA_zunmlqt(
-        magma_side_t side, magma_trans_t trans,
-        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-        const magmaDoubleComplex *A,    magma_int_t LDA,
-        const magmaDoubleComplex *T,    magma_int_t LDT,
-        magmaDoubleComplex *C,    magma_int_t LDC,
-        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+int
+CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans,
+             int M, int N, int K, int IB,
+             const cuDoubleComplex *A,    int LDA,
+             const cuDoubleComplex *T,    int LDT,
+             cuDoubleComplex *C,    int LDC,
+             cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
     int i, kb;
     int i1, i3;
@@ -42,13 +42,13 @@ int CUDA_zunmlqt(
     int mi = M;
 
     /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
+    if ((side != MorseLeft) && (side != MorseRight)) {
         return -1;
     }
     /*
      * NQ is the order of Q and NW is the minimum dimension of WORK
      */
-    if (side == MagmaLeft) {
+    if (side == MorseLeft) {
         nq = M;
         nw = N;
     }
@@ -57,7 +57,7 @@ int CUDA_zunmlqt(
         nw = M;
     }
 
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+    if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
         return -2;
     }
     if (M < 0) {
@@ -84,10 +84,10 @@ int CUDA_zunmlqt(
 
     /* Quick return */
     if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
+        return MORSE_SUCCESS;
 
-    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
-        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
+    if (((side == MorseLeft) && (trans == MorseNoTrans))
+        || ((side == MorseRight) && (trans != MorseNoTrans))) {
         i1 = 0;
         i3 = IB;
     }
@@ -106,7 +106,7 @@ int CUDA_zunmlqt(
     for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
         kb = min(IB, K-i);
 
-        if (side == MagmaLeft) {
+        if (side == MorseLeft) {
             /*
              * H or H' is applied to C(i:m,1:n)
              */
@@ -121,13 +121,13 @@ int CUDA_zunmlqt(
             jc = i;
         }
 
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
+        CUDA_zlarfb( side, trans, MorseForward, MorseRowwise,
+                     mi, ni, kb,
+                     A + LDA * i  + i,  LDA,
+                     T + LDT * i,       LDT,
+                     C + LDC * jc + ic, LDC,
+                     WORK, LDWORK, CUBLAS_STREAM_VALUE);
     }
     return MORSE_SUCCESS;
 }
-#endif
+
diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c
index e868b1e9277f2a333cd2a33ef33f3b6c4a65cd1b..6032cabc4bb04e58a637370078abc19509374433 100644
--- a/cudablas/compute/cuda_zunmqrt.c
+++ b/cudablas/compute/cuda_zunmqrt.c
@@ -24,14 +24,14 @@
  **/
 #include "cudablas/include/cudablas.h"
 
-#if defined(CHAMELEON_USE_MAGMA)
-int CUDA_zunmqrt(
-        magma_side_t side, magma_trans_t trans,
-        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
-        const magmaDoubleComplex *A,    magma_int_t LDA,
-        const magmaDoubleComplex *T,    magma_int_t LDT,
-        magmaDoubleComplex *C,    magma_int_t LDC,
-        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+int
+CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans,
+             int M, int N, int K, int IB,
+             const cuDoubleComplex *A,    int LDA,
+             const cuDoubleComplex *T,    int LDT,
+             cuDoubleComplex *C,    int LDC,
+             cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
     int i, kb;
     int i1, i3;
@@ -42,13 +42,13 @@ int CUDA_zunmqrt(
     int mi = M;
 
     /* Check input arguments */
-    if ((side != MagmaLeft) && (side != MagmaRight)) {
+    if ((side != MorseLeft) && (side != MorseRight)) {
         return -1;
     }
     /*
      * NQ is the order of Q and NW is the minimum dimension of WORK
      */
-    if (side == MagmaLeft) {
+    if (side == MorseLeft) {
         nq = M;
         nw = N;
     }
@@ -57,7 +57,7 @@ int CUDA_zunmqrt(
         nw = M;
     }
 
-    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+    if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
         return -2;
     }
     if (M < 0) {
@@ -84,10 +84,10 @@ int CUDA_zunmqrt(
 
     /* Quick return */
     if ((M == 0) || (N == 0) || (K == 0))
-        return MAGMA_SUCCESS;
+        return MORSE_SUCCESS;
 
-    if (((side == MagmaLeft) && (trans != MagmaNoTrans))
-        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
+    if (((side == MorseLeft) && (trans != MorseNoTrans))
+        || ((side == MorseRight) && (trans == MorseNoTrans))) {
         i1 = 0;
         i3 = IB;
     }
@@ -99,7 +99,7 @@ int CUDA_zunmqrt(
     for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
         kb = min(IB, K-i);
 
-        if (side == MagmaLeft) {
+        if (side == MorseLeft) {
             /*
              * H or H' is applied to C(i:m,1:n)
              */
@@ -114,14 +114,14 @@ int CUDA_zunmqrt(
             jc = i;
         }
 
-        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
-                          mi, ni, kb,
-                          A + LDA * i  + i,  LDA,
-                          T + LDT * i,       LDT,
-                          C + LDC * jc + ic, LDC,
-                          WORK, LDWORK);
+        CUDA_zlarfb( side, trans, MorseForward, MorseColumnwise,
+                     mi, ni, kb,
+                     A + LDA * i  + i,  LDA,
+                     T + LDT * i,       LDT,
+                     C + LDC * jc + ic, LDC,
+                     WORK, LDWORK,
+                     CUBLAS_STREAM_VALUE );
     }
 
     return MORSE_SUCCESS;
 }
-#endif
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
index 064583858fc3ce83c6a51a8285b65e49913f6e30..6f622b44e8c7270f6d97f85a905cdd77bc18245c 100644
--- a/cudablas/include/cudablas_z.h
+++ b/cudablas/include/cudablas_z.h
@@ -54,6 +54,7 @@ int CUDA_zgeqrt( magma_int_t m, magma_int_t n, magma_int_t nb, magmaDoubleComple
 int CUDA_zgessm( char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib, magma_int_t *ipiv, cuDoubleComplex *dL1, magma_int_t lddl1, cuDoubleComplex *dL, magma_int_t lddl, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zgetrf_incpiv( char storev, magma_int_t m, magma_int_t n, magma_int_t ib, cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda, cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl, magma_int_t *ipiv, cuDoubleComplex *dwork, magma_int_t lddwork, magma_int_t *info);
 int CUDA_zgetrf_nopiv( magma_int_t m, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zlarfb(MORSE_enum side, MORSE_enum trans, MORSE_enum direct, MORSE_enum storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zlauum( char uplo, magma_int_t n, cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zpotrf( magma_uplo_t uplo, magma_int_t n, magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
 int CUDA_zssssm( magma_storev_t storev, magma_int_t m1, magma_int_t n1, magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib, magmaDoubleComplex *dA1, magma_int_t ldda1, magmaDoubleComplex *dA2, magma_int_t ldda2, magmaDoubleComplex *dL1, magma_int_t lddl1, magmaDoubleComplex *dL2, magma_int_t lddl2, magma_int_t *IPIV, magma_int_t *info);