diff --git a/cudablas/CMakeLists.txt b/cudablas/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..d8d434c5acde1a1ba2fce5b06d0e7e0b423a623a
--- /dev/null
+++ b/cudablas/CMakeLists.txt
@@ -0,0 +1,35 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+#  @file CMakeLists.txt
+#
+#  @project MORSE
+#  MORSE is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah Univesity of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+#
+#  @version 0.9.0
+#  @author Cedric Castagnede
+#  @author Emmanuel Agullo
+#  @author Mathieu Faverge
+#  @date 13-07-2012
+#
+###
+
+add_subdirectory(include)
+add_subdirectory(compute)
+#add_subdirectory(eztrace_module)
+
+###
+### END CMakeLists.txt
+###
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..fea59161b9009f47830cfdac2552df152aefa3d6
--- /dev/null
+++ b/cudablas/compute/CMakeLists.txt
@@ -0,0 +1,69 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+#  @file CMakeLists.txt
+#
+#  @project MORSE
+#  MORSE is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah Univesity of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+#
+#  @author Florent Pruvost
+#  @date 16-09-2015
+#
+###
+
+# Generate the morse sources for all possible precisions
+# ------------------------------------------------------
+set(CUDABLAS_SRCS_GENERATED "")
+set(ZSRC
+    cuda_zgelqt.c
+    #cuda_zgemerge.c
+    cuda_zgeqrt.c
+    cuda_zgessm.c
+    cuda_zgetrf.c
+    cuda_zlauum.c
+    #cuda_zparfb.c
+    cuda_zpotrf.c
+    cuda_zssssm.c
+    cuda_ztrtri.c
+    cuda_ztslqt.c
+    cuda_ztsmlq.c
+    cuda_ztsmqr.c
+    cuda_ztsqrt.c
+    cuda_ztstrf.c
+    cuda_zunmlqt.c
+    cuda_zunmqrt.c
+    )
+
+precisions_rules_py(CUDABLAS_SRCS_GENERATED "${ZSRC}"
+                    PRECISIONS "${CHAMELEON_PRECISION}")
+
+set(CUDABLAS_SRCS
+    ${CUDABLAS_SRCS_GENERATED}
+    )
+
+# Compile step
+# ------------
+add_library(cudablas ${CUDABLAS_SRCS})
+add_dependencies(cudablas cudablas_include)
+set_property(TARGET cudablas PROPERTY LINKER_LANGUAGE Fortran)
+
+# installation
+# ------------
+install(TARGETS cudablas
+        DESTINATION lib)
+
+###
+### END CMakeLists.txt
+###
diff --git a/cudablas/compute/cuda_zgelqt.c b/cudablas/compute/cuda_zgelqt.c
new file mode 100644
index 0000000000000000000000000000000000000000..c000530185406e40af79d966cf1a357a11d70bc0
--- /dev/null
+++ b/cudablas/compute/cuda_zgelqt.c
@@ -0,0 +1,172 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zgelqt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zgelqt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da, magma_int_t ldda,
+        magmaDoubleComplex *v,  magma_int_t ldv,
+        magmaDoubleComplex *dt, magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream)
+{
+#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
+#define v_ref(a_1,a_2)  ( v+(a_2)*(ldv) + (a_1))
+#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
+#define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))
+
+    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
+    double _Complex one=1.;
+
+    if (m < 0) {
+    return -1;
+    } else if (n < 0) {
+    return -2;
+    } else if (ldda < max(1,m)) {
+    return -4;
+    }
+
+    k = min(m,n);
+    if (k == 0) {
+    hwork[0] = *((magmaDoubleComplex*) &one);
+    return MAGMA_SUCCESS;
+    }
+
+    lddwork= m;
+
+    /* lower parts of little T must be zero: memset to 0 for simplicity */
+    memset(t_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
+    cudaMemset(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex));
+
+    /* copy first panel of A on the host */
+    cublasGetMatrix(min(m, nb), n, sizeof(magmaDoubleComplex),
+              da_ref(0, 0), ldda,
+              v, ldv);
+
+    /* Use blocked code initially */
+    for (i = 0; i < k; i += nb) {
+
+    ib = min(k-i, nb);
+    if (i+nb >= m) ib = min(m-i, nb);
+    cols = n-i;
+
+    if (i > 0){
+
+      /* copy panel of A from device to host */
+      cublasGetMatrix(ib, n, sizeof(magmaDoubleComplex),
+                      da_ref(i, 0), ldda,
+                      v, ldv);
+
+      /* Apply H' to A(i+2*ib:m, i:n) from the right */
+      rows = m-old_i-2*old_ib;
+      if (rows > 0){
+          magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                            rows, n-old_i, old_ib,
+                            da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
+                            da_ref(old_i+2*old_ib, old_i), ldda,
+                            dwork, lddwork);
+      }
+
+      /* copy the lower diag tile into d_A */
+      CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
+                    dd, ldd, da_ref(old_i, old_i), ldda, stream);
+
+    }
+
+    /* Form the triangular factor of the block reflector on the host
+    H = H'(i+ib-1) . . . H(i+1) H(i) */
+    CORE_zgelqt(ib, cols, ib,
+              (double _Complex*) v_ref(0,i), ldv,
+              (double _Complex*) t_ref(0,0), ldt,
+              (double _Complex*) tau+i,
+              (double _Complex*) hwork);
+
+    if ( i + ib < m ){
+      /* put 0s in the lower triangular part of a panel (and 1s on the
+       diagonal); copy the lower triangular in d */
+      CORE_zgesplit(MorseRight, MorseUnit, ib, min(cols,ib),
+                    (double _Complex*) v_ref(0,i), ldv,
+                    (double _Complex*) d, ldd);
+
+      /* copy from host to device a tile diag */
+      cublasSetMatrix( ib, min(cols,ib), sizeof(magmaDoubleComplex),
+                       d, ldd, dd, ldd );
+    }
+
+    /* Send the triangular factor T to the GPU */
+    cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
+                   t_ref(0,0), ldt, dt_ref(0,i), lddt );
+
+    /* A panel (with zeros in lower tri of its diag) is ready to be used
+    in input of zlarfb_gpu: we send the panel to the gpu */
+    cublasSetMatrix( ib, cols, sizeof(magmaDoubleComplex),
+                   v_ref(0,i), ldv, da_ref(i,i), ldda );
+
+    if (i + ib < m) {
+
+      if (i+2*ib < m){
+          rows = ib;
+      }
+      else{
+          rows = m-i-ib;
+      }
+      /* Apply H' to A(i+ib:i+2*ib, i:n) from the right */
+      magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                        rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
+                        lddt, da_ref(i+ib,i), ldda, dwork, lddwork);
+      old_i = i;
+      old_ib = ib;
+      if (i+nb >= k){
+          /* Apply H' to A(i+2*ib:m, i:n) from the right */
+          rows = m-old_i-2*old_ib;
+          if (rows > 0){
+              magma_zlarfb_gpu( MagmaRight, MagmaNoTrans, MagmaForward, MagmaRowwise,
+                                rows, cols, old_ib,
+                                da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
+                                da_ref(old_i+2*old_ib, old_i), ldda,
+                                dwork, lddwork);
+          }
+          /* copy the upper diag tile into d_A */
+          CUDA_zgemerge(MagmaRight, MagmaUnit, old_ib, old_ib,
+                        dd, ldd, da_ref(old_i, old_i), ldda, stream);
+      }
+    }
+
+    }
+
+#undef da_ref
+#undef v_ref
+#undef dt_ref
+#undef t_ref
+
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zgemerge.c b/cudablas/compute/cuda_zgemerge.c
new file mode 100644
index 0000000000000000000000000000000000000000..2d5a5d53d021d3c27803745fe2ad8d4194c6a3c5
--- /dev/null
+++ b/cudablas/compute/cuda_zgemerge.c
@@ -0,0 +1,137 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zmerge.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+int CUDA_zgemerge(
+        magma_side_t side, magma_diag_t diag,
+        magma_int_t M, magma_int_t N,
+        magmaDoubleComplex *A, magma_int_t LDA,
+        magmaDoubleComplex *B, magma_int_t LDB,
+        CUstream stream)
+{
+    int i, j;
+    magmaDoubleComplex *cola, *colb;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+
+    stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    if (M < 0) {
+        return -1;
+    }
+    if (N < 0) {
+        return -2;
+    }
+    if ( (LDA < max(1,M)) && (M > 0) ) {
+        return -5;
+    }
+    if ( (LDB < max(1,M)) && (M > 0) ) {
+        return -7;
+    }
+
+    if (side == MagmaLeft){
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(handle, i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }else{
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }
+
+    cublasDestroy(handle);
+
+    return MORSE_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
+int CUDA_zgemerge(
+        magma_side_t side, magma_diag_t diag,
+        magma_int_t M, magma_int_t N,
+        magmaDoubleComplex *A, magma_int_t LDA,
+        magmaDoubleComplex *B, magma_int_t LDB,
+        CUstream stream)
+{
+    int i, j;
+    magmaDoubleComplex *cola, *colb;
+
+    if (M < 0) {
+        return -1;
+    }
+    if (N < 0) {
+        return -2;
+    }
+    if ( (LDA < max(1,M)) && (M > 0) ) {
+        return -5;
+    }
+    if ( (LDB < max(1,M)) && (M > 0) ) {
+        return -7;
+    }
+
+    if (side == MagmaLeft){
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }else{
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif
diff --git a/cudablas/compute/cuda_zgeqrt.c b/cudablas/compute/cuda_zgeqrt.c
new file mode 100644
index 0000000000000000000000000000000000000000..e8478ed1c13325d6e9cdfbd215fb36be40cffb8c
--- /dev/null
+++ b/cudablas/compute/cuda_zgeqrt.c
@@ -0,0 +1,292 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zgeqrt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zgeqrt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da, magma_int_t ldda,
+        magmaDoubleComplex *v,  magma_int_t ldv,
+        magmaDoubleComplex *dt, magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream)
+{
+#define da_ref(a_1,a_2) ( da+(a_2)*(ldda) + (a_1))
+#define v_ref(a_1,a_2)  ( v+(a_2)*(ldv) + (a_1))
+#define dt_ref(a_1,a_2) ( dt+(a_2)*(lddt) + (a_1))
+#define t_ref(a_1,a_2)  ( t+(a_2)*(ldt) + (a_1))
+
+    int i, k, ib, lddwork, old_i, old_ib, rows, cols;
+    double _Complex one=1.;
+
+//    int lwkopt = n * nb;
+//    hwork[0] = *((magmaDoubleComplex*) &lwkopt);
+
+    if (m < 0) {
+        return -1;
+    } else if (n < 0) {
+        return -2;
+    } else if (ldda < max(1,m)) {
+        return -4;
+    }
+
+    k = min(m,n);
+    if (k == 0) {
+        hwork[0] = *((magmaDoubleComplex*) &one);
+        return MAGMA_SUCCESS;
+    }
+
+    lddwork= k;
+
+    /* lower parts of little T must be zero: memset to 0 for simplicity */
+    memset(t_ref(0,0), 0, nb*nb*sizeof(magmaDoubleComplex));
+    cudaMemsetAsync(dt_ref(0,0), 0, nb*n*sizeof(magmaDoubleComplex), stream);
+
+    /* copy first panel of A on the host */
+//  cublasGetMatrix(m, min(nb,n), sizeof(magmaDoubleComplex),
+//                    da_ref(0, 0), ldda,
+//                    v, ldv);
+    cudaMemcpy( v, da_ref(0,0),
+                m*min(nb,n)*sizeof(magmaDoubleComplex),
+                cudaMemcpyDeviceToHost);
+
+    /* Use blocked code initially */
+    for (i = 0; i < k; i += nb) {
+
+        ib = min(k-i, nb);
+        if (i+nb>=n) ib = min(n-i, nb);
+        rows = m-i;
+
+        if (i>0){
+
+            /* copy panel of A from device to host */
+//          cublasGetMatrix(m, ib, sizeof(magmaDoubleComplex),
+//                            da_ref(0, i), ldda,
+//                            v, ldv);
+            /* copy panel of A from device to host */
+            cudaMemcpy( v, da_ref(0,i),
+                        m*ib*sizeof(magmaDoubleComplex),
+                        cudaMemcpyDeviceToHost);
+
+            /* Apply H' to A(i:m,i+2*ib:n) from the left */
+            cols = n-old_i-2*old_ib;
+            if (cols > 0){
+                magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
+                                  m-old_i, cols, old_ib,
+                                  da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
+                                  da_ref(old_i, old_i+2*old_ib), ldda,
+                                  dwork, cols);
+            }
+
+            /* copy the upper diag tile into d_A */
+            CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib,
+                          dd, ldd, da_ref(old_i, old_i), ldda, stream);
+
+        }
+
+        /* Form the triangular factor of the block reflector on the host
+         H = H(i) H(i+1) . . . H(i+ib-1) */
+        CORE_zgeqrt(rows, ib, ib,
+                    (double _Complex*) v_ref(i,0), ldv,
+                    (double _Complex*) t_ref(0,0), ldt,
+                    (double _Complex*) tau+i,
+                    (double _Complex*) hwork);
+
+        if ( i + ib < n ){
+            /* put 0s in the upper triangular part of a panel (and 1s on the
+             diagonal); copy the upper triangular in d */
+            CORE_zgesplit(MorseLeft, MorseUnit, min(rows,ib), ib,
+                          (double _Complex*) v_ref(i,0), ldv,
+                          (double _Complex*) d, ldd);
+
+            /* copy from host to device a tile diag */
+            cublasSetMatrix( min(rows,ib), ib, sizeof(magmaDoubleComplex),
+                             d, ldd, dd, ldd );
+        }
+
+        /* Send the triangular factor T to the GPU */
+        cublasSetMatrix( ib, ib, sizeof(magmaDoubleComplex),
+                         t_ref(0,0), ldt, dt_ref(0,i), lddt );
+
+        /* A panel (with zeros in upper tri of its diag) is ready to be used
+         in input of zlarfb_gpu: we send the panel to the gpu */
+        cublasSetMatrix( rows, ib, sizeof(magmaDoubleComplex),
+                         v_ref(i,0), ldv, da_ref(i,i), ldda );
+
+        if (i + ib < n) {
+
+            if (i+2*ib < n){
+                cols = ib;
+            }
+            else{
+                cols = n-i-ib;
+            }
+            /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
+            magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
+                              rows, cols, ib, da_ref(i,i), ldda, dt_ref(0,i),
+                              lddt, da_ref(i,i+ib), ldda, dwork, cols);
+            old_i = i;
+            old_ib = ib;
+            if (i+nb>=k){
+                /* Apply H' to A(i:m,i+2*ib:n) from the left */
+                cols = n-old_i-2*old_ib;
+                if (cols > 0){
+                    magma_zlarfb_gpu( MagmaLeft, MagmaConjTrans, MagmaForward, MagmaColumnwise,
+                                      rows, cols, old_ib,
+                                      da_ref(old_i, old_i), ldda, dt_ref(0,old_i), lddt,
+                                      da_ref(old_i, old_i+2*old_ib), ldda,
+                                      dwork, cols);
+                }
+                /* copy the upper diag tile into d_A */
+                CUDA_zgemerge(MagmaLeft, MagmaUnit, old_ib, old_ib,
+                              dd, ldd, da_ref(old_i, old_i), ldda, stream);
+            }
+        }
+
+    }
+
+#undef da_ref
+#undef v_ref
+#undef dt_ref
+#undef t_ref
+
+    return MORSE_SUCCESS;
+}
+
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+int CUDA_zgemerge(
+        magma_side_t side, magma_diag_t diag,
+        magma_int_t M, magma_int_t N,
+        magmaDoubleComplex *A, magma_int_t LDA,
+        magmaDoubleComplex *B, magma_int_t LDB,
+        CUstream stream)
+{
+    int i, j;
+    magmaDoubleComplex *cola, *colb;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+
+    stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    if (M < 0) {
+        return -1;
+    }
+    if (N < 0) {
+        return -2;
+    }
+    if ( (LDA < max(1,M)) && (M > 0) ) {
+        return -5;
+    }
+    if ( (LDB < max(1,M)) && (M > 0) ) {
+        return -7;
+    }
+
+    if (side == MagmaLeft){
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(handle, i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }else{
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(handle, M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }
+
+    cublasDestroy(handle);
+
+    return MORSE_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
+int CUDA_zgemerge(
+        magma_side_t side, magma_diag_t diag,
+        magma_int_t M, magma_int_t N,
+        magmaDoubleComplex *A, magma_int_t LDA,
+        magmaDoubleComplex *B, magma_int_t LDB,
+        CUstream stream)
+{
+    int i, j;
+    magmaDoubleComplex *cola, *colb;
+
+    if (M < 0) {
+        return -1;
+    }
+    if (N < 0) {
+        return -2;
+    }
+    if ( (LDA < max(1,M)) && (M > 0) ) {
+        return -5;
+    }
+    if ( (LDB < max(1,M)) && (M > 0) ) {
+        return -7;
+    }
+
+    if (side == MagmaLeft){
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(i+1, cola, 1, colb, 1);
+            cudaMemcpyAsync(colb , cola,
+                            (i+1)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }else{
+        for(i=0; i<N; i++){
+            cola = A + i*LDA;
+            colb = B + i*LDB;
+//            cublasZcopy(M-i, cola + i, 1, colb + i, 1);
+            cudaMemcpyAsync(colb+i , cola+i,
+                            (M-i)*sizeof(cuDoubleComplex),
+                            cudaMemcpyDeviceToDevice, stream);
+        }
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif
diff --git a/cudablas/compute/cuda_zgessm.c b/cudablas/compute/cuda_zgessm.c
new file mode 100644
index 0000000000000000000000000000000000000000..d8c0198ec0675259b6a128450943f04918aa6e9d
--- /dev/null
+++ b/cudablas/compute/cuda_zgessm.c
@@ -0,0 +1,57 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zgessm.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
+int CUDA_zgessm(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib,
+        magma_int_t *ipiv,
+        cuDoubleComplex *dL1, magma_int_t lddl1,
+        cuDoubleComplex *dL,  magma_int_t lddl,
+        cuDoubleComplex *dA,  magma_int_t ldda,
+        magma_int_t *info)
+{
+    int ret;
+    /* The kernel is just using the inverted part or nothing */
+    if ( lddl1 >= 2*ib ) {
+      dL1 += ib;
+      ret = magma_zgessm_gpu( storev, m, n, k, ib,
+                  ipiv, dL1, lddl1, dL, lddl, dA, ldda, info );
+    }
+    else {
+      ret = magma_zgessm_gpu( storev, m, n, k, ib,
+              ipiv, NULL, 1, dL, lddl, dA, ldda, info );
+    }
+
+    if (ret != MAGMA_SUCCESS) {
+        fprintf(stderr, "Error in MAGMA: %d\n", ret);
+        exit(-1);
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif
+#endif
diff --git a/cudablas/compute/cuda_zgetrf.c b/cudablas/compute/cuda_zgetrf.c
new file mode 100644
index 0000000000000000000000000000000000000000..dc6d18431c5bf3a6603489d7a9ae5d1e93982dd6
--- /dev/null
+++ b/cudablas/compute/cuda_zgetrf.c
@@ -0,0 +1,54 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zgetrf.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
+int CUDA_zgetrf_incpiv(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t ib,
+        cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda,
+        cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl,
+        magma_int_t *ipiv,
+        cuDoubleComplex *dwork, magma_int_t lddwork,
+        magma_int_t *info)
+{
+    magma_zgetrf_incpiv_gpu( storev, m, n, ib,
+                             hA, ldha, dA, ldda,
+                             hL, ldhl,  dL, lddl,
+                             ipiv,
+                             dwork, lddwork,
+                             info );
+    return MORSE_SUCCESS;
+}
+#endif
+int CUDA_zgetrf_nopiv(
+        magma_int_t m, magma_int_t n,
+        cuDoubleComplex *dA, magma_int_t ldda,
+        magma_int_t *info)
+{
+    magma_zgetrf_nopiv_gpu( m, n, dA, ldda, info );
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zlauum.c b/cudablas/compute/cuda_zlauum.c
new file mode 100644
index 0000000000000000000000000000000000000000..2e0467c182863ca3ed1f0b1137aa44c027bb5f72
--- /dev/null
+++ b/cudablas/compute/cuda_zlauum.c
@@ -0,0 +1,40 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zlauum.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zlauum(
+        char uplo, magma_int_t n,
+        cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
+{
+    int ret;
+    ret = magma_zlauum_gpu( uplo, n, dA, ldda, info);
+    if (ret != MAGMA_SUCCESS) {
+        fprintf(stderr, "Error in MAGMA: %d\n", ret);
+        exit(-1);
+    }
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c
new file mode 100644
index 0000000000000000000000000000000000000000..ce1104d0971c893eee9266d69bda7f5b313dec30
--- /dev/null
+++ b/cudablas/compute/cuda_zparfb.c
@@ -0,0 +1,581 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zparfb.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+int CUDA_zparfb(
+        magma_side_t side, magma_trans_t trans,
+        magma_direct_t direct, magma_storev_t storev,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t L,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK,
+        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream)
+
+{
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
+    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
+    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
+#else
+    double zzero = 0.0;
+    double zone  = 1.0;
+    double mzone = -1.0;
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+    int j;
+    magma_trans_t transW;
+    magma_trans_t transA2;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasOperation_t cublasTrans;
+    cublasOperation_t cublasTransW;
+    cublasOperation_t cublasTransA2;
+
+    stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    if (trans == MagmaNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MagmaTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MagmaConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in CUDA_zparfb: bad trans parameter %d\n", trans);
+    }
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
+        return -3;
+    }
+    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
+        return -4;
+    }
+    if (M1 < 0) {
+        return -5;
+    }
+    if (N1 < 0) {
+        return -6;
+    }
+    if ((M2 < 0) ||
+        ( (side == MagmaRight) && (M1 != M2) ) ) {
+        return -7;
+    }
+    if ((N2 < 0) ||
+        ( (side == MagmaLeft) && (N1 != N2) ) ) {
+        return -8;
+    }
+    if (K < 0) {
+        return -9;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (direct == MagmaForward) {
+
+        if (side == MagmaLeft) {
+
+            /*
+             * Column or Rowwise / Forward / Left
+             * ----------------------------------
+             *
+             * Form  H * A  or  H' * A  where  A = ( A1 )
+             *                                     ( A2 )
+             */
+
+            /*
+             * W = A1 + V' * A2:
+             *      W = A1
+             *      W = W + V' * A2
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               K * sizeof(cuDoubleComplex), N1,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW);
+            }
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, cublasTransW, CUBLAS_OP_N,
+                        K, N1, M2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)V     /* K*M2  */, LDV,
+                        (const cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = op(T) * W */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    K, N2,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - op(T) * W */
+                for(j = 0; j < N1; j++) {
+                    cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone,
+                                (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - op(V) * W  */
+                cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)V     /* M2*K  */, LDV,
+                            (const cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
+
+            } else {
+                /* Wc = V * op(T) */
+                cublasZgemm( handle, cublasTransA2, cublasTrans,
+                             M2, K, K,
+                             (const cuDoubleComplex *) &zone,  V,     LDV,
+                                    T,     LDT,
+                             (const cuDoubleComplex *) &zzero, WORKC, LDWORKC );
+
+                /* A1 = A1 - opt(T) * W */
+                cublasZgemm( handle, cublasTrans, CUBLAS_OP_N,
+                             K, N1, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)T,    LDT,
+                             (const cuDoubleComplex *)WORK, LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex*)A1,   LDA1 );
+
+                /* A2 = A2 - Wc * W */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+        else {
+            /*
+             * Column or Rowwise / Forward / Right
+             * -----------------------------------
+             *
+             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
+             *
+             */
+
+            /*
+             * W = A1 + A2 * V':
+             *      W = A1
+             *      W = W + A2 * V'
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               M1 * sizeof(cuDoubleComplex), K,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW);
+            }
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, CUBLAS_OP_N, cublasTransW,
+                        M1, K, N2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
+                        (const cuDoubleComplex*)V     /* N2*K  */, LDV,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = W * op(T) */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    M2, K,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - W * op(T) */
+                for(j = 0; j < K; j++) {
+                    cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone,
+                        (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                        (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - W * op(V)  */
+                cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
+                            (const cuDoubleComplex*)V     /* K*N2  */, LDV,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
+
+            } else {
+                /* A1 = A1 - W * opt(T) */
+                cublasZgemm( handle, CUBLAS_OP_N, cublasTrans,
+                    M1, K, K,
+                    (const cuDoubleComplex *) &mzone,
+                    (const cuDoubleComplex *)WORK, LDWORK,
+                    (const cuDoubleComplex *)T,    LDT,
+                    (const cuDoubleComplex *) &zone,
+                    (cuDoubleComplex *)A1,   LDA1 );
+
+                /* Wc = op(T) * V */
+                cublasZgemm( handle, cublasTrans, cublasTransA2,
+                             K, N2, K,
+                             (const cuDoubleComplex *) &zone,
+                             (const cuDoubleComplex *)T,     LDT,
+                             (const cuDoubleComplex *)V,     LDV,
+                             (const cuDoubleComplex *) &zzero,
+                             (cuDoubleComplex *)WORKC, LDWORKC );
+
+                /* A2 = A2 - W * Wc */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+    }
+    else {
+        fprintf(stderr, "Not implemented (Backward / Left or Right)");
+        return MAGMA_ERR_NOT_SUPPORTED;
+    }
+
+    cublasDestroy(handle);
+
+    return MORSE_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
+magma_int_t
+CUDA_zparfb(magma_side_t side, magma_trans_t trans,
+                 magma_direct_t direct, magma_storev_t storev,
+                 magma_int_t M1, magma_int_t N1,
+                 magma_int_t M2, magma_int_t N2,
+                 magma_int_t K, magma_int_t L,
+                       magmaDoubleComplex *A1, magma_int_t LDA1,
+                       magmaDoubleComplex *A2, magma_int_t LDA2,
+                 const magmaDoubleComplex *V, magma_int_t LDV,
+                 const magmaDoubleComplex *T, magma_int_t LDT,
+                       magmaDoubleComplex *WORK, magma_int_t LDWORK,
+                       magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+                       CUstream stream)
+{
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
+    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
+    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
+#else
+    double zzero = 0.0;
+    double zone  = 1.0;
+    double mzone = -1.0;
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+    int j;
+    magma_trans_t transW;
+    magma_trans_t transA2;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
+        return -3;
+    }
+    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
+        return -4;
+    }
+    if (M1 < 0) {
+        return -5;
+    }
+    if (N1 < 0) {
+        return -6;
+    }
+    if ((M2 < 0) ||
+        ( (side == MagmaRight) && (M1 != M2) ) ) {
+        return -7;
+    }
+    if ((N2 < 0) ||
+        ( (side == MagmaLeft) && (N1 != N2) ) ) {
+        return -8;
+    }
+    if (K < 0) {
+        return -9;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (direct == MagmaForward) {
+
+        if (side == MagmaLeft) {
+
+            /*
+             * Column or Rowwise / Forward / Left
+             * ----------------------------------
+             *
+             * Form  H * A  or  H' * A  where  A = ( A1 )
+             *                                     ( A2 )
+             */
+
+            /*
+             * W = A1 + V' * A2:
+             *      W = A1
+             *      W = W + V' * A2
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               K * sizeof(cuDoubleComplex), N1,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+
+            cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans),
+                        K, N1, M2,
+                        zone,
+                        (cuDoubleComplex*)V     /* K*M2  */, LDV,
+                        (cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
+                        zone,
+                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = op(T) * W */
+                cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper),
+                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
+                             K, N2,
+                             zone,
+                             (cuDoubleComplex*)T, LDT,
+                             (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - op(T) * W */
+                for(j = 0; j < N1; j++) {
+                    cublasZaxpy(K, mzone,
+                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - op(V) * W  */
+                cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans),
+                            M2, N2, K,
+                            mzone,
+                            (cuDoubleComplex*)V     /* M2*K  */, LDV,
+                            (cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
+                            zone,
+                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
+
+            } else {
+                /* Wc = V * op(T) */
+                cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans),
+                             M2, K, K,
+                             zone,  V,     LDV,
+                                    T,     LDT,
+                             zzero, WORKC, LDWORKC );
+
+                /* A1 = A1 - opt(T) * W */
+                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans),
+                             K, N1, K,
+                             mzone, T,    LDT,
+                                    WORK, LDWORK,
+                             zone,  A1,   LDA1 );
+
+                /* A2 = A2 - Wc * W */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
+                             M2, N2, K,
+                             mzone, WORKC, LDWORKC,
+                                    WORK,  LDWORK,
+                             zone,  A2,    LDA2 );
+            }
+        }
+        else {
+            /*
+             * Column or Rowwise / Forward / Right
+             * -----------------------------------
+             *
+             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
+             *
+             */
+
+            /*
+             * W = A1 + A2 * V':
+             *      W = A1
+             *      W = W + A2 * V'
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               M1 * sizeof(cuDoubleComplex), K,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+
+            cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW),
+                        M1, K, N2,
+                        zone,
+                        (cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
+                        (cuDoubleComplex*)V     /* N2*K  */, LDV,
+                        zone,
+                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = W * op(T) */
+                cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper),
+                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
+                             M2, K,
+                             zone,
+                             (cuDoubleComplex*)T, LDT,
+                             (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - W * op(T) */
+                for(j = 0; j < K; j++) {
+                    cublasZaxpy(M1, mzone,
+                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - W * op(V)  */
+                cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2),
+                            M2, N2, K,
+                            mzone,
+                            (cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
+                            (cuDoubleComplex*)V     /* K*N2  */, LDV,
+                            zone,
+                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
+
+            } else {
+                /* A1 = A1 - W * opt(T) */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans),
+                             M1, K, K,
+                             mzone, WORK, LDWORK,
+                                    T,    LDT,
+                             zone,  A1,   LDA1 );
+
+                /* Wc = op(T) * V */
+                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2),
+                             K, N2, K,
+                             zone,  T,     LDT,
+                                    V,     LDV,
+                             zzero, WORKC, LDWORKC );
+
+                /* A2 = A2 - W * Wc */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
+                             M2, N2, K,
+                             mzone, WORK,  LDWORK,
+                                    WORKC, LDWORKC,
+                             zone,  A2,    LDA2 );
+            }
+        }
+    }
+    else {
+        fprintf(stderr, "Not implemented (Backward / Left or Right)");
+        return MAGMA_ERR_NOT_SUPPORTED;
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif
diff --git a/cudablas/compute/cuda_zpotrf.c b/cudablas/compute/cuda_zpotrf.c
new file mode 100644
index 0000000000000000000000000000000000000000..0c4b608fbec70a9c7ca6a4eb19c43077fc3077e8
--- /dev/null
+++ b/cudablas/compute/cuda_zpotrf.c
@@ -0,0 +1,43 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zpotrf.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zpotrf(
+        magma_uplo_t uplo,  magma_int_t n,
+        magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
+{
+    int ret;
+    ret = magma_zpotrf_gpu(
+        uplo,
+        n, dA, ldda, info);
+/*  hA, stream );*/
+     if (ret != MAGMA_SUCCESS) {
+        fprintf(stderr, "Error in MAGMA: %d\n", ret);
+        exit(-1);
+    }
+     return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zssssm.c b/cudablas/compute/cuda_zssssm.c
new file mode 100644
index 0000000000000000000000000000000000000000..86349db0dd218ffdad0c23b1c60c7ee44af41c55
--- /dev/null
+++ b/cudablas/compute/cuda_zssssm.c
@@ -0,0 +1,46 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zssssm.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+#if defined(HAVE_MAGMA_GETRF_INCPIV_GPU)
+int CUDA_zssssm(
+        magma_storev_t storev, magma_int_t m1, magma_int_t n1,
+        magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib,
+        magmaDoubleComplex *dA1, magma_int_t ldda1,
+        magmaDoubleComplex *dA2, magma_int_t ldda2,
+        magmaDoubleComplex *dL1, magma_int_t lddl1,
+        magmaDoubleComplex *dL2, magma_int_t lddl2,
+        magma_int_t *IPIV, magma_int_t *info)
+{
+    magma_zssssm_gpu(
+        storev, m1, n1, m2, n2, k, ib,
+        dA1, ldda1, dA2, ldda2,
+        dL1, lddl1, dL2, lddl2,
+        IPIV, info);
+    return MORSE_SUCCESS;
+}
+#endif
+#endif
diff --git a/cudablas/compute/cuda_ztrtri.c b/cudablas/compute/cuda_ztrtri.c
new file mode 100644
index 0000000000000000000000000000000000000000..e10b207af9e3a78ca21b9dceaa6727fed97cb4e5
--- /dev/null
+++ b/cudablas/compute/cuda_ztrtri.c
@@ -0,0 +1,40 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztrtri.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztrtri(
+        magma_uplo_t uplo,  magma_diag_t diag, magma_int_t n,
+        magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info)
+{
+    int ret;
+    ret = magma_ztrtri_gpu( uplo, diag, n, dA, ldda, info);
+     if (ret != MAGMA_SUCCESS) {
+        fprintf(stderr, "Error in MAGMA: %d\n", ret);
+        exit(-1);
+    }
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_ztslqt.c b/cudablas/compute/cuda_ztslqt.c
new file mode 100644
index 0000000000000000000000000000000000000000..5fb1e4e92fc34846411a7b898f7c53948176a859
--- /dev/null
+++ b/cudablas/compute/cuda_ztslqt.c
@@ -0,0 +1,181 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztslqt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztslqt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da1, magma_int_t ldda1,
+        magmaDoubleComplex *da2, magma_int_t ldda2,
+        magmaDoubleComplex *a2,  magma_int_t lda2,
+        magmaDoubleComplex *dt,  magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream)
+{
+#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1))
+#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1))
+#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1))
+#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1))
+#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1))
+#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1))
+
+    int i, k, lddwork, old_i, old_ib, rows, cols;
+    int ib;
+    double _Complex one=1.;
+
+    if (m < 0) {
+        return -1;
+    } else if (n < 0) {
+        return -2;
+    } else if (ldda2 < max(1,m)) {
+        return -4;
+    }
+
+    k = min(m,n);
+    if (k == 0) {
+        hwork[0] = *((magmaDoubleComplex*) &one);
+        return MAGMA_SUCCESS;
+    }
+
+    lddwork= m;
+
+    /* lower parts of little T must be zero: memset all to 0 for simplicity */
+    memset(t, 0, nb*n*sizeof(magmaDoubleComplex));
+    cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
+
+    k = min(m, nb); // m can be lower than IB
+    /* copy the first diag tile of A1 from device to host: da1 -> d */
+    cublasGetMatrix(k, k, sizeof(magmaDoubleComplex),
+                    da1_ref(0, 0), ldda1,
+                    d, ldd);
+
+    /* copy first panel of A2 from device to host: da2 -> a2 */
+    cublasGetMatrix(k, n, sizeof(magmaDoubleComplex),
+                    da2_ref(0, 0), ldda2,
+                    a2, lda2);
+
+    /* This is only blocked code for now */
+    for (i = 0; i < m; i += nb) {
+
+        ib = min(m-i, nb);
+        cols = n;
+
+        /* Send the next panel (diagonal block of A1 & block column of A2)
+           to the CPU (in work_a1 and work_a2) and apply tsmqr update on the
+           remaining non updated panels */
+        if (i>0) {
+
+            /* copy the diag tile of A1 from device to host: da1 -> d */
+            cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                            da1_ref(i, i), ldda1,
+                            d, ldd);
+
+            /* copy panel of A2 from device to host: da2 -> a2 */
+            cublasGetMatrix(ib, cols, sizeof(magmaDoubleComplex),
+                            da2_ref(i, 0), ldda2,
+                            a2, lda2);
+
+            /* Apply H' to A(i+2*ib:m,i:n) from the left */
+            rows = m-old_i-2*old_ib;
+            if (rows > 0){
+                CUDA_ztsmlq(
+                        MagmaRight, MagmaConjTrans,
+                        rows, old_ib, rows, cols, old_ib, old_ib,
+                        da1_ref(old_i+2*old_ib, old_i), ldda1,
+                        da2_ref(old_i+2*old_ib, 0), ldda2,
+                        da2_ref(old_i, 0), ldda2,
+                        dt_ref(0, old_i), lddt,
+                        dwork, lddwork,
+                        dwork + nb * lddwork, nb,
+                        stream );
+            }
+
+        }
+
+        /* compute LQ factorization of the panel of A2 ib x cols */
+        CORE_ztslqt(ib, cols, ib,
+                    (double _Complex*) d, ldd,
+                    (double _Complex*) a2, lda2,
+                    (double _Complex*) t, ldt,
+                    (double _Complex*) tau,
+                    (double _Complex*) hwork);
+
+        /* Send the panel from A2 back to the GPU */
+        cublasSetMatrix(ib, cols, sizeof(magmaDoubleComplex),
+                        a2, lda2,
+                        da2_ref(i, 0), ldda2);
+
+        /* Send the triangular factor T from hwork to the GPU */
+        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                        t, ldt,
+                        dt_ref(0, i), lddt);
+
+        /* get back the diag tile in A1 from host to device: d -> da1 */
+        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                        d, ldd,
+                        da1_ref(i, i), ldda1);
+
+        /* tsmlq update on one panel forward (look ahead 1) */
+        if (i + ib < m) {
+
+            if (i+2*ib < m){
+                rows = ib;
+            }
+            else{
+                rows = m-i-ib;
+            }
+
+            /* Apply H' to A(i+ib:i+2*ib,i:n) from the right */
+            CUDA_ztsmlq(
+                    MagmaRight, MagmaConjTrans,
+                    rows, ib, rows, cols, ib, ib,
+                    da1_ref(i+ib, i), ldda1,
+                    da2_ref(i+ib, 0), ldda2,
+                    da2_ref(i, 0), ldda2,
+                    dt_ref(0, i), lddt,
+                    dwork, lddwork,
+                    dwork + nb * lddwork, nb,
+                    stream );
+            old_i = i;
+            old_ib = ib;
+        }
+    }
+
+#undef da1_ref
+#undef da2_ref
+#undef a2_ref
+#undef t_ref
+#undef dt_ref
+#undef d_ref
+
+    return MORSE_SUCCESS;
+
+}
+#endif
diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c
new file mode 100644
index 0000000000000000000000000000000000000000..67eb42a1412c158b15564cd37c68cbbac2b5d88e
--- /dev/null
+++ b/cudablas/compute/cuda_ztsmlq.c
@@ -0,0 +1,156 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztsmlq.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztsmlq(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t IB,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+              magmaDoubleComplex *WORK,  magma_int_t LDWORK,
+              magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream)
+{
+    int i, i1, i3;
+    int NW;
+    int kb;
+    int ic = 0;
+    int jc = 0;
+    int mi = M1;
+    int ni = N1;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+
+    /* NW is the minimum dimension of WORK */
+    if (side == MagmaLeft) {
+        NW = IB;
+    }
+    else {
+        NW = N1;
+    }
+
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if (M1 < 0) {
+        return -3;
+    }
+    if (N1 < 0) {
+        return -4;
+    }
+    if ( (M2 < 0) ||
+         ( (M2 != M1) && (side == MagmaRight) ) ){
+        return -5;
+    }
+    if ( (N2 < 0) ||
+         ( (N2 != N1) && (side == MagmaLeft) ) ){
+        return -6;
+    }
+    if ((K < 0) ||
+        ( (side == MagmaLeft)  && (K > M1) ) ||
+        ( (side == MagmaRight) && (K > N1) ) ) {
+        return -7;
+    }
+    if (IB < 0) {
+        return -8;
+    }
+    if (LDA1 < max(1,M1)){
+        return -10;
+    }
+    if (LDA2 < max(1,M2)){
+        return -12;
+    }
+    if (LDV < max(1,K)){
+        return -14;
+    }
+    if (LDT < max(1,IB)){
+        return -16;
+    }
+    if (LDWORK < max(1,NW)){
+        return -18;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+        return MAGMA_SUCCESS;
+
+    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
+        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ((K-1) / IB)*IB;
+        i3 = -IB;
+    }
+
+    if (trans == MagmaNoTrans) {
+        trans = MagmaConjTrans;
+    }
+    else {
+        trans = MagmaNoTrans;
+    }
+
+    for(i = i1; (i > -1) && (i < K); i += i3) {
+        kb = min(IB, K-i);
+
+        if (side == MagmaLeft) {
+            /*
+             * H or H' is applied to C(i:m,1:n)
+             */
+            mi = M1 - i;
+            ic = i;
+        }
+        else {
+            /*
+             * H or H' is applied to C(1:m,i:n)
+             */
+            ni = N1 - i;
+            jc = i;
+        }
+
+        /*
+         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
+         */
+        CUDA_zparfb(
+                side, trans, MagmaForward, MagmaRowwise,
+                mi, ni, M2, N2, kb, 0,
+                A1 + LDA1*jc+ic, LDA1,
+                A2, LDA2,
+                V + i, LDV,
+                T + LDT*i, LDT,
+                WORK, LDWORK, WORKC, LDWORKC, stream );
+    }
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c
new file mode 100644
index 0000000000000000000000000000000000000000..de9c05dc986d7593c0ecf505325762301d788aac
--- /dev/null
+++ b/cudablas/compute/cuda_ztsmqr.c
@@ -0,0 +1,704 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztsmqr.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztsmqr(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t IB,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+            magmaDoubleComplex *WORK,  magma_int_t LDWORK,
+            magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream)
+{
+    int i, i1, i3;
+    int NQ, NW;
+    int kb;
+    int ic = 0;
+    int jc = 0;
+    int mi = M1;
+    int ni = N1;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+
+    /* NQ is the order of Q */
+    if (side == MagmaLeft) {
+        NQ = M2;
+        NW = IB;
+    }
+    else {
+        NQ = N2;
+        NW = M1;
+    }
+
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if (M1 < 0) {
+        return -3;
+    }
+    if (N1 < 0) {
+        return -4;
+    }
+    if ( (M2 < 0) ||
+         ( (M2 != M1) && (side == MagmaRight) ) ){
+        return -5;
+    }
+    if ( (N2 < 0) ||
+         ( (N2 != N1) && (side == MagmaLeft) ) ){
+        return -6;
+    }
+    if ((K < 0) ||
+        ( (side == MagmaLeft)  && (K > M1) ) ||
+        ( (side == MagmaRight) && (K > N1) ) ) {
+        return -7;
+    }
+    if (IB < 0) {
+        return -8;
+    }
+    if (LDA1 < max(1,M1)){
+        return -10;
+    }
+    if (LDA2 < max(1,M2)){
+        return -12;
+    }
+    if (LDV < max(1,NQ)){
+        return -14;
+    }
+    if (LDT < max(1,IB)){
+        return -16;
+    }
+    if (LDWORK < max(1,NW)){
+        return -18;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+        return MAGMA_SUCCESS;
+
+    if (((side == MagmaLeft)  && (trans != MagmaNoTrans))
+        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ((K-1) / IB)*IB;
+        i3 = -IB;
+    }
+
+    for(i = i1; (i > -1) && (i < K); i += i3) {
+        kb = min(IB, K-i);
+
+        if (side == MagmaLeft) {
+            /*
+             * H or H' is applied to C(i:m,1:n)
+             */
+            mi = M1 - i;
+            ic = i;
+        }
+        else {
+            /*
+             * H or H' is applied to C(1:m,i:n)
+             */
+            ni = N1 - i;
+            jc = i;
+        }
+        /*
+         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
+         */
+        CUDA_zparfb(
+                side, trans, MagmaForward, MagmaColumnwise,
+                mi, ni, M2, N2, kb, 0,
+                A1 + LDA1*jc+ic, LDA1,
+                A2, LDA2,
+                V + LDV*i, LDV,
+                T + LDT*i, LDT,
+                WORK, LDWORK, WORKC, LDWORKC, stream );
+    }
+    return MORSE_SUCCESS;
+}
+
+#if defined(CHAMELEON_USE_CUBLAS_V2)
+int CUDA_zparfb(
+        magma_side_t side, magma_trans_t trans,
+        magma_direct_t direct, magma_storev_t storev,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t L,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK,
+        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream)
+
+{
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
+    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
+    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
+#else
+    double zzero = 0.0;
+    double zone  = 1.0;
+    double mzone = -1.0;
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+    int j;
+    magma_trans_t transW;
+    magma_trans_t transA2;
+    cublasHandle_t handle;
+    cublasStatus_t stat;
+    cublasOperation_t cublasTrans;
+    cublasOperation_t cublasTransW;
+    cublasOperation_t cublasTransA2;
+
+    stat = cublasCreate(&handle);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("CUBLAS initialization failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    stat = cublasSetStream(handle, stream);
+    if (stat != CUBLAS_STATUS_SUCCESS) {
+        printf ("cublasSetStream failed\n");
+        assert( stat == CUBLAS_STATUS_SUCCESS );
+    }
+
+    if (trans == MagmaNoTrans){
+        cublasTrans = CUBLAS_OP_N;
+    }else if(trans == MagmaTrans){
+        cublasTrans = CUBLAS_OP_T;
+    }else if(trans == MagmaConjTrans){
+        cublasTrans = CUBLAS_OP_C;
+    }else{
+        fprintf(stderr, "Error in CUDA_zparfb: bad trans parameter %d\n", trans);
+    }
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
+        return -3;
+    }
+    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
+        return -4;
+    }
+    if (M1 < 0) {
+        return -5;
+    }
+    if (N1 < 0) {
+        return -6;
+    }
+    if ((M2 < 0) ||
+        ( (side == MagmaRight) && (M1 != M2) ) ) {
+        return -7;
+    }
+    if ((N2 < 0) ||
+        ( (side == MagmaLeft) && (N1 != N2) ) ) {
+        return -8;
+    }
+    if (K < 0) {
+        return -9;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (direct == MagmaForward) {
+
+        if (side == MagmaLeft) {
+
+            /*
+             * Column or Rowwise / Forward / Left
+             * ----------------------------------
+             *
+             * Form  H * A  or  H' * A  where  A = ( A1 )
+             *                                     ( A2 )
+             */
+
+            /*
+             * W = A1 + V' * A2:
+             *      W = A1
+             *      W = W + V' * A2
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               K * sizeof(cuDoubleComplex), N1,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW);
+            }
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, cublasTransW, CUBLAS_OP_N,
+                        K, N1, M2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)V     /* K*M2  */, LDV,
+                        (const cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = op(T) * W */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_LEFT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    K, N2,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - op(T) * W */
+                for(j = 0; j < N1; j++) {
+                    cublasZaxpy(handle, K, (const cuDoubleComplex *) &mzone,
+                                (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - op(V) * W  */
+                cublasZgemm(handle, cublasTransA2, CUBLAS_OP_N,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)V     /* M2*K  */, LDV,
+                            (const cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
+
+            } else {
+                /* Wc = V * op(T) */
+                cublasZgemm( handle, cublasTransA2, cublasTrans,
+                             M2, K, K,
+                             (const cuDoubleComplex *) &zone,  V,     LDV,
+                                    T,     LDT,
+                             (const cuDoubleComplex *) &zzero, WORKC, LDWORKC );
+
+                /* A1 = A1 - opt(T) * W */
+                cublasZgemm( handle, cublasTrans, CUBLAS_OP_N,
+                             K, N1, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)T,    LDT,
+                             (const cuDoubleComplex *)WORK, LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex*)A1,   LDA1 );
+
+                /* A2 = A2 - Wc * W */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+        else {
+            /*
+             * Column or Rowwise / Forward / Right
+             * -----------------------------------
+             *
+             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
+             *
+             */
+
+            /*
+             * W = A1 + A2 * V':
+             *      W = A1
+             *      W = W + A2 * V'
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               M1 * sizeof(cuDoubleComplex), K,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+
+            if (transW == MagmaNoTrans){
+                cublasTransW = CUBLAS_OP_N;
+            }else if(transW == MagmaTrans){
+                cublasTransW = CUBLAS_OP_T;
+            }else if(transW == MagmaConjTrans){
+                cublasTransW = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transW parameter %d\n", transW);
+            }
+            if (transA2 == MagmaNoTrans){
+                cublasTransA2 = CUBLAS_OP_N;
+            }else if(transA2 == MagmaTrans){
+                cublasTransA2 = CUBLAS_OP_T;
+            }else if(transA2 == MagmaConjTrans){
+                cublasTransA2 = CUBLAS_OP_C;
+            }else{
+                fprintf(stderr, "Error in CUDA_zparfb: bad transA2 parameter %d\n", transA2);
+            }
+
+            cublasZgemm(handle, CUBLAS_OP_N, cublasTransW,
+                        M1, K, N2,
+                        (const cuDoubleComplex *) &zone,
+                        (const cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
+                        (const cuDoubleComplex*)V     /* N2*K  */, LDV,
+                        (const cuDoubleComplex *) &zone,
+                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = W * op(T) */
+                cublasZtrmm( handle,
+                    CUBLAS_SIDE_RIGHT, CUBLAS_FILL_MODE_UPPER,
+                    cublasTrans, CUBLAS_DIAG_NON_UNIT,
+                    M2, K,
+                    (const cuDoubleComplex *) &zone,
+                    (const cuDoubleComplex*)T, LDT,
+                    (cuDoubleComplex*)WORK, LDWORK,
+                    (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - W * op(T) */
+                for(j = 0; j < K; j++) {
+                    cublasZaxpy(handle, M1, (const cuDoubleComplex *) &mzone,
+                        (const cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                        (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - W * op(V)  */
+                cublasZgemm(handle, CUBLAS_OP_N, cublasTransA2,
+                            M2, N2, K,
+                            (const cuDoubleComplex *) &mzone,
+                            (const cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
+                            (const cuDoubleComplex*)V     /* K*N2  */, LDV,
+                            (const cuDoubleComplex *) &zone,
+                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
+
+            } else {
+                /* A1 = A1 - W * opt(T) */
+                cublasZgemm( handle, CUBLAS_OP_N, cublasTrans,
+                    M1, K, K,
+                    (const cuDoubleComplex *) &mzone,
+                    (const cuDoubleComplex *)WORK, LDWORK,
+                    (const cuDoubleComplex *)T,    LDT,
+                    (const cuDoubleComplex *) &zone,
+                    (cuDoubleComplex *)A1,   LDA1 );
+
+                /* Wc = op(T) * V */
+                cublasZgemm( handle, cublasTrans, cublasTransA2,
+                             K, N2, K,
+                             (const cuDoubleComplex *) &zone,
+                             (const cuDoubleComplex *)T,     LDT,
+                             (const cuDoubleComplex *)V,     LDV,
+                             (const cuDoubleComplex *) &zzero,
+                             (cuDoubleComplex *)WORKC, LDWORKC );
+
+                /* A2 = A2 - W * Wc */
+                cublasZgemm( handle, CUBLAS_OP_N, CUBLAS_OP_N,
+                             M2, N2, K,
+                             (const cuDoubleComplex *) &mzone,
+                             (const cuDoubleComplex *)WORK,  LDWORK,
+                             (const cuDoubleComplex *)WORKC, LDWORKC,
+                             (const cuDoubleComplex *) &zone,
+                             (cuDoubleComplex *)A2,    LDA2 );
+            }
+        }
+    }
+    else {
+        fprintf(stderr, "Not implemented (Backward / Left or Right)");
+        return MAGMA_ERR_NOT_SUPPORTED;
+    }
+
+    cublasDestroy(handle);
+
+    return MORSE_SUCCESS;
+}
+#else /* CHAMELEON_USE_CUBLAS_V2 */
+int CUDA_zparfb(
+        magma_side_t side, magma_trans_t trans,
+        magma_direct_t direct, magma_storev_t storev,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t L,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK,
+        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream)
+{
+#if defined(PRECISION_z) || defined(PRECISION_c)
+    cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
+    cuDoubleComplex zone  = make_cuDoubleComplex(1.0, 0.0);
+    cuDoubleComplex mzone = make_cuDoubleComplex(-1.0, 0.0);
+#else
+    double zzero = 0.0;
+    double zone  = 1.0;
+    double mzone = -1.0;
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+    int j;
+    magma_trans_t transW;
+    magma_trans_t transA2;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if ((direct != MagmaForward) && (direct != MagmaBackward)) {
+        return -3;
+    }
+    if ((storev != MagmaColumnwise) && (storev != MagmaRowwise)) {
+        return -4;
+    }
+    if (M1 < 0) {
+        return -5;
+    }
+    if (N1 < 0) {
+        return -6;
+    }
+    if ((M2 < 0) ||
+        ( (side == MagmaRight) && (M1 != M2) ) ) {
+        return -7;
+    }
+    if ((N2 < 0) ||
+        ( (side == MagmaLeft) && (N1 != N2) ) ) {
+        return -8;
+    }
+    if (K < 0) {
+        return -9;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (direct == MagmaForward) {
+
+        if (side == MagmaLeft) {
+
+            /*
+             * Column or Rowwise / Forward / Left
+             * ----------------------------------
+             *
+             * Form  H * A  or  H' * A  where  A = ( A1 )
+             *                                     ( A2 )
+             */
+
+            /*
+             * W = A1 + V' * A2:
+             *      W = A1
+             *      W = W + V' * A2
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               K * sizeof(cuDoubleComplex), N1,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+            transA2 = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+
+            cublasZgemm(morse_lapack_const(transW), morse_lapack_const(MagmaNoTrans),
+                        K, N1, M2,
+                        zone,
+                        (cuDoubleComplex*)V     /* K*M2  */, LDV,
+                        (cuDoubleComplex*)A2    /* M2*N1 */, LDA2,
+                        zone,
+                        (cuDoubleComplex*)WORK  /* K*N1  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = op(T) * W */
+                cublasZtrmm( morse_lapack_const(MagmaLeft), morse_lapack_const(MagmaUpper),
+                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
+                             K, N2,
+                             zone,
+                             (cuDoubleComplex*)T, LDT,
+                             (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - op(T) * W */
+                for(j = 0; j < N1; j++) {
+                    cublasZaxpy(K, mzone,
+                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - op(V) * W  */
+                cublasZgemm(morse_lapack_const(transA2), morse_lapack_const(MagmaNoTrans),
+                            M2, N2, K,
+                            mzone,
+                            (cuDoubleComplex*)V     /* M2*K  */, LDV,
+                            (cuDoubleComplex*)WORK  /* K*N2  */, LDWORK,
+                            zone,
+                            (cuDoubleComplex*)A2    /* m2*N2 */, LDA2);
+
+            } else {
+                /* Wc = V * op(T) */
+                cublasZgemm( morse_lapack_const(transA2), morse_lapack_const(trans),
+                             M2, K, K,
+                             zone,  V,     LDV,
+                                    T,     LDT,
+                             zzero, WORKC, LDWORKC );
+
+                /* A1 = A1 - opt(T) * W */
+                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(MagmaNoTrans),
+                             K, N1, K,
+                             mzone, T,    LDT,
+                                    WORK, LDWORK,
+                             zone,  A1,   LDA1 );
+
+                /* A2 = A2 - Wc * W */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
+                             M2, N2, K,
+                             mzone, WORKC, LDWORKC,
+                                    WORK,  LDWORK,
+                             zone,  A2,    LDA2 );
+            }
+        }
+        else {
+            /*
+             * Column or Rowwise / Forward / Right
+             * -----------------------------------
+             *
+             * Form  H * A  or  H' * A  where A  = ( A1 A2 )
+             *
+             */
+
+            /*
+             * W = A1 + A2 * V':
+             *      W = A1
+             *      W = W + A2 * V'
+             *
+             */
+            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
+                               A1,   LDA1   * sizeof(cuDoubleComplex),
+                               M1 * sizeof(cuDoubleComplex), K,
+                               cudaMemcpyDeviceToDevice, stream );
+
+            transW  = storev == MorseColumnwise ? MagmaNoTrans : MagmaConjTrans;
+            transA2 = storev == MorseColumnwise ? MagmaConjTrans : MagmaNoTrans;
+
+            cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transW),
+                        M1, K, N2,
+                        zone,
+                        (cuDoubleComplex*)A2    /* M1*N2 */, LDA2,
+                        (cuDoubleComplex*)V     /* N2*K  */, LDV,
+                        zone,
+                        (cuDoubleComplex*)WORK  /* M1*K  */, LDWORK);
+
+            if (WORKC == NULL) {
+                /* W = W * op(T) */
+                cublasZtrmm( morse_lapack_const(MagmaRight), morse_lapack_const(MagmaUpper),
+                             morse_lapack_const(trans), morse_lapack_const(MagmaNonUnit),
+                             M2, K,
+                             zone,
+                             (cuDoubleComplex*)T, LDT,
+                             (cuDoubleComplex*)WORK, LDWORK);
+
+
+                /* A1 = A1 - W = A1 - W * op(T) */
+                for(j = 0; j < K; j++) {
+                    cublasZaxpy(M1, mzone,
+                                (cuDoubleComplex*)(WORK + LDWORK*j), 1,
+                                (cuDoubleComplex*)(A1 + LDA1*j), 1);
+                }
+
+                /* A2 = A2 - W * op(V)  */
+                cublasZgemm(morse_lapack_const(MagmaNoTrans), morse_lapack_const(transA2),
+                            M2, N2, K,
+                            mzone,
+                            (cuDoubleComplex*)WORK  /* M2*K  */, LDWORK,
+                            (cuDoubleComplex*)V     /* K*N2  */, LDV,
+                            zone,
+                            (cuDoubleComplex*)A2    /* M2*N2 */, LDA2);
+
+            } else {
+                /* A1 = A1 - W * opt(T) */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(trans),
+                             M1, K, K,
+                             mzone, WORK, LDWORK,
+                                    T,    LDT,
+                             zone,  A1,   LDA1 );
+
+                /* Wc = op(T) * V */
+                cublasZgemm( morse_lapack_const(trans), morse_lapack_const(transA2),
+                             K, N2, K,
+                             zone,  T,     LDT,
+                                    V,     LDV,
+                             zzero, WORKC, LDWORKC );
+
+                /* A2 = A2 - W * Wc */
+                cublasZgemm( morse_lapack_const(MagmaNoTrans), morse_lapack_const(MagmaNoTrans),
+                             M2, N2, K,
+                             mzone, WORK,  LDWORK,
+                                    WORKC, LDWORKC,
+                             zone,  A2,    LDA2 );
+            }
+        }
+    }
+    else {
+        fprintf(stderr, "Not implemented (Backward / Left or Right)");
+        return MAGMA_ERR_NOT_SUPPORTED;
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif /* CHAMELEON_USE_CUBLAS_V2 */
+#endif
diff --git a/cudablas/compute/cuda_ztsqrt.c b/cudablas/compute/cuda_ztsqrt.c
new file mode 100644
index 0000000000000000000000000000000000000000..8750e5a6bfd4e50d1a509d29a7cd3dd5662f2be8
--- /dev/null
+++ b/cudablas/compute/cuda_ztsqrt.c
@@ -0,0 +1,203 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztsqrt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztsqrt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da1, magma_int_t ldda1,
+        magmaDoubleComplex *da2, magma_int_t ldda2,
+        magmaDoubleComplex *a2,  magma_int_t lda2,
+        magmaDoubleComplex *dt,  magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream)
+{
+#define da1_ref(a_1,a_2) ( da1+(a_2)*ldda1 + (a_1))
+#define da2_ref(a_1,a_2) ( da2+(a_2)*ldda2 + (a_1))
+#define a2_ref(a_1,a_2) ( a2+(a_2)*lda2 + (a_1))
+#define t_ref(a_1,a_2) ( t+(a_2)*ldt + (a_1))
+#define dt_ref(a_1,a_2) ( dt+(a_2)*lddt + (a_1))
+#define d_ref(a_1,a_2) ( d+(a_2)*ldd + (a_1))
+
+    int i, k, lddwork, old_i, old_ib, rows, cols;
+    int ib;
+    double _Complex one=1.;
+//  int lwkopt = n * nb;
+//  hwork[0] = *((magmaDoubleComplex*) &lwkopt);
+
+    if (m < 0) {
+        return -1;
+    } else if (n < 0) {
+        return -2;
+    } else if (ldda2 < max(1,m)) {
+        return -4;
+    }
+
+    k = min(m,n);
+    if (k == 0) {
+        hwork[0] = *((magmaDoubleComplex*) &one);
+        return MAGMA_SUCCESS;
+    }
+
+    lddwork= nb;
+
+    /* lower parts of little T must be zero: memset all to 0 for simplicity */
+    memset(t, 0, nb*nb*sizeof(magmaDoubleComplex));
+    cudaMemset(dt, 0, nb*n*sizeof(magmaDoubleComplex));
+
+    /* copy the first diag tile of A1 from device to host: da1 -> d */
+    cublasGetMatrix(nb, nb, sizeof(magmaDoubleComplex),
+                    da1_ref(0, 0), ldda1,
+                    d, ldd);
+//  cudaMemcpy( d, da1_ref(0,0),
+//              nb*nb*sizeof(cuDoubleComplex),
+//              cudaMemcpyDeviceToHost );
+
+    /* copy first panel of A2 from device to host: da2 -> a2 */
+//  cublasGetMatrix(m, nb, sizeof(magmaDoubleComplex),
+//                    da2_ref(0, 0), ldda2,
+//                    a2, lda2);
+    cudaMemcpy( a2, da2_ref(0, 0),
+                m*nb*sizeof(cuDoubleComplex),
+                cudaMemcpyDeviceToHost );
+
+    /* This is only blocked code for now */
+    for (i = 0; i < n; i += nb) {
+
+        ib = min(n-i, nb);
+        rows = m;
+
+        /* Send the next panel (diagonal block of A1 & block column of A2)
+           to the CPU (in work_a1 and work_a2) and apply tsmqr update on the
+           remaining non updated panels */
+        if (i>0) {
+
+            /* copy the diag tile of A1 from device to host: da1 -> d */
+            cublasGetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                            da1_ref(i, i), ldda1,
+                            d, ldd);
+//          cudaMemcpy( d, da1_ref(i,i),
+//              ib*ib*sizeof(cuDoubleComplex),
+//              cudaMemcpyDeviceToHost );
+
+            /* copy panel of A2 from device to host: da2 -> a2 */
+            cublasGetMatrix(rows, ib, sizeof(magmaDoubleComplex),
+                            da2_ref(0, i), ldda2,
+                            a2, lda2);
+//            cudaMemcpy( a2, da2_ref(0,i),
+//                rows*ib*sizeof(cuDoubleComplex),
+//                cudaMemcpyDeviceToHost );
+
+            /* Apply H' to A(i:m,i+2*ib:n) from the left */
+            cols = n-old_i-2*old_ib;
+            if (cols > 0){
+                CUDA_ztsmqr(
+                        MagmaLeft, MagmaConjTrans,
+                        old_ib, cols, rows, cols, old_ib, old_ib,
+                        da1_ref(old_i, old_i+2*old_ib), ldda1,
+                        da2_ref(0, old_i+2*old_ib), ldda2,
+                        da2_ref(0, old_i), ldda2,
+                        dt_ref(0, old_i), lddt,
+                        dwork, old_ib,
+                        dwork + old_ib * cols, rows,
+                        stream );
+            }
+
+        }
+
+        /* compute QR factorization of the panel of A2 rows x ib */
+        CORE_ztsqrt(rows, ib, ib,
+                    (double _Complex*) d, ldd,
+                    (double _Complex*) a2, lda2,
+                    (double _Complex*) t, ldt,
+                    (double _Complex*) tau,
+                    (double _Complex*) hwork);
+
+        /* Send the panel from A2 back to the GPU */
+        cublasSetMatrix(rows, ib, sizeof(magmaDoubleComplex),
+                        a2, lda2,
+                        da2_ref(0, i), ldda2);
+//        cudaMemcpy( da2_ref(0,i), a2,
+//            rows*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
+
+        /* Send the triangular factor T from hwork to the GPU */
+        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                        t, ldt,
+                        dt_ref(0, i), lddt);
+//        cudaMemcpy( dt_ref(0,i), t,
+//            ib*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
+
+        /* get back the diag tile in A1 from host to device: d -> da1 */
+        cublasSetMatrix(ib, ib, sizeof(magmaDoubleComplex),
+                        d, ldd,
+                        da1_ref(i, i), ldda1);
+//        cudaMemcpy( da1_ref(i, i), d,
+//            ib*ib*sizeof(cuDoubleComplex),
+//            cudaMemcpyHostToDevice );
+
+        /* tsmqr update on one panel forward (look ahead 1) */
+        if (i + ib < n) {
+
+            if (i+2*ib < n){
+                cols = ib;
+            }
+            else{
+                cols = n-i-ib;
+            }
+
+            /* Apply H' to A(i:m,i+ib:i+2*ib) from the left */
+            CUDA_ztsmqr(
+                    MagmaLeft, MagmaConjTrans,
+                    ib, cols, rows, cols, ib, ib,
+                    da1_ref(i, i+ib), ldda1,
+                    da2_ref(0, i+ib), ldda2,
+                    da2_ref(0, i), ldda2,
+                    dt_ref(0, i), lddt,
+                    dwork, ib,
+                    dwork + ib * cols, rows,
+                    stream );
+            old_i = i;
+            old_ib = ib;
+        }
+    }
+
+#undef da1_ref
+#undef da2_ref
+#undef a2_ref
+#undef t_ref
+#undef dt_ref
+#undef d_ref
+
+    return MORSE_SUCCESS;
+
+}
+#endif
diff --git a/cudablas/compute/cuda_ztstrf.c b/cudablas/compute/cuda_ztstrf.c
new file mode 100644
index 0000000000000000000000000000000000000000..1632ba706bd266e01dfef1af586f88d9e6bca6bf
--- /dev/null
+++ b/cudablas/compute/cuda_ztstrf.c
@@ -0,0 +1,48 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_ztstrf.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_ztstrf(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb,
+        cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu,
+        cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda,
+        cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl,
+        magma_int_t *ipiv,
+        cuDoubleComplex *hwork, magma_int_t ldhwork,
+        cuDoubleComplex *dwork, magma_int_t lddwork,
+        magma_int_t *info)
+{
+
+    magma_ztstrf_gpu( storev, m, n, ib, nb,
+                      hU, ldhu, dU, lddu,
+                      hA, ldha, dA, ldda,
+                      hL, ldhl, dL, lddl,
+                      ipiv,
+                      hwork, ldhwork, dwork, lddwork,
+                      info );
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zunmlqt.c b/cudablas/compute/cuda_zunmlqt.c
new file mode 100644
index 0000000000000000000000000000000000000000..5268acea872e5d73644e3efe53b0cc9f26b4f8dd
--- /dev/null
+++ b/cudablas/compute/cuda_zunmlqt.c
@@ -0,0 +1,133 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zunmlqt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zunmlqt(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
+        const magmaDoubleComplex *A,    magma_int_t LDA,
+        const magmaDoubleComplex *T,    magma_int_t LDT,
+        magmaDoubleComplex *C,    magma_int_t LDC,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+{
+    int i, kb;
+    int i1, i3;
+    int nq, nw;
+    int ic = 0;
+    int jc = 0;
+    int ni = N;
+    int mi = M;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    /*
+     * NQ is the order of Q and NW is the minimum dimension of WORK
+     */
+    if (side == MagmaLeft) {
+        nq = M;
+        nw = N;
+    }
+    else {
+        nq = N;
+        nw = M;
+    }
+
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if (M < 0) {
+        return -3;
+    }
+    if (N < 0) {
+        return -4;
+    }
+    if ((K < 0) || (K > nq)) {
+        return -5;
+    }
+    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
+        return -6;
+    }
+    if ((LDA < max(1,K)) && (K > 0)) {
+        return -8;
+    }
+    if ((LDC < max(1,M)) && (M > 0)) {
+        return -12;
+    }
+    if ((LDWORK < max(1,nw)) && (nw > 0)) {
+        return -14;
+    }
+
+    /* Quick return */
+    if ((M == 0) || (N == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (((side == MagmaLeft) && (trans == MagmaNoTrans))
+        || ((side == MagmaRight) && (trans != MagmaNoTrans))) {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ( ( K-1 ) / IB )*IB;
+        i3 = -IB;
+    }
+
+    if( trans == MorseNoTrans) {
+        trans = MorseConjTrans;
+    }
+    else {
+        trans = MorseNoTrans;
+    }
+
+    for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
+        kb = min(IB, K-i);
+
+        if (side == MagmaLeft) {
+            /*
+             * H or H' is applied to C(i:m,1:n)
+             */
+            mi = M - i;
+            ic = i;
+        }
+        else {
+            /*
+             * H or H' is applied to C(1:m,i:n)
+             */
+            ni = N - i;
+            jc = i;
+        }
+
+        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaRowwise,
+                          mi, ni, kb,
+                          A + LDA * i  + i,  LDA,
+                          T + LDT * i,       LDT,
+                          C + LDC * jc + ic, LDC,
+                          WORK, LDWORK);
+    }
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/compute/cuda_zunmqrt.c b/cudablas/compute/cuda_zunmqrt.c
new file mode 100644
index 0000000000000000000000000000000000000000..786d0184119cbb7bbd827a0e33071cbedfda0391
--- /dev/null
+++ b/cudablas/compute/cuda_zunmqrt.c
@@ -0,0 +1,127 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cuda_zunmqrt.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zunmqrt(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
+        const magmaDoubleComplex *A,    magma_int_t LDA,
+        const magmaDoubleComplex *T,    magma_int_t LDT,
+        magmaDoubleComplex *C,    magma_int_t LDC,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK )
+{
+    int i, kb;
+    int i1, i3;
+    int nq, nw;
+    int ic = 0;
+    int jc = 0;
+    int ni = N;
+    int mi = M;
+
+    /* Check input arguments */
+    if ((side != MagmaLeft) && (side != MagmaRight)) {
+        return -1;
+    }
+    /*
+     * NQ is the order of Q and NW is the minimum dimension of WORK
+     */
+    if (side == MagmaLeft) {
+        nq = M;
+        nw = N;
+    }
+    else {
+        nq = N;
+        nw = M;
+    }
+
+    if ((trans != MagmaNoTrans) && (trans != MagmaConjTrans)) {
+        return -2;
+    }
+    if (M < 0) {
+        return -3;
+    }
+    if (N < 0) {
+        return -4;
+    }
+    if ((K < 0) || (K > nq)) {
+        return -5;
+    }
+    if ((IB < 0) || ( (IB == 0) && ((M > 0) && (N > 0)) )) {
+        return -6;
+    }
+    if ((LDA < max(1,nq)) && (nq > 0)) {
+        return -8;
+    }
+    if ((LDC < max(1,M)) && (M > 0)) {
+        return -12;
+    }
+    if ((LDWORK < max(1,nw)) && (nw > 0)) {
+        return -14;
+    }
+
+    /* Quick return */
+    if ((M == 0) || (N == 0) || (K == 0))
+        return MAGMA_SUCCESS;
+
+    if (((side == MagmaLeft) && (trans != MagmaNoTrans))
+        || ((side == MagmaRight) && (trans == MagmaNoTrans))) {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ( ( K-1 ) / IB )*IB;
+        i3 = -IB;
+    }
+
+    for(i = i1; (i >- 1) && (i < K); i+=i3 ) {
+        kb = min(IB, K-i);
+
+        if (side == MagmaLeft) {
+            /*
+             * H or H' is applied to C(i:m,1:n)
+             */
+            mi = M - i;
+            ic = i;
+        }
+        else {
+            /*
+             * H or H' is applied to C(1:m,i:n)
+             */
+            ni = N - i;
+            jc = i;
+        }
+
+        magma_zlarfb_gpu( side, trans, MagmaForward, MagmaColumnwise,
+                          mi, ni, kb,
+                          A + LDA * i  + i,  LDA,
+                          T + LDT * i,       LDT,
+                          C + LDC * jc + ic, LDC,
+                          WORK, LDWORK);
+    }
+
+    return MORSE_SUCCESS;
+}
+#endif
diff --git a/cudablas/include/CMakeLists.txt b/cudablas/include/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9635e1f953ce9a4a86f51043b911a395d25147b6
--- /dev/null
+++ b/cudablas/include/CMakeLists.txt
@@ -0,0 +1,58 @@
+###
+#
+# @copyright (c) 2009-2014 The University of Tennessee and The University
+#                          of Tennessee Research Foundation.
+#                          All rights reserved.
+# @copyright (c) 2012-2014 Inria. All rights reserved.
+# @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+#
+###
+#
+#  @file CMakeLists.txt
+#
+#  @project MORSE
+#  MORSE is a software package provided by:
+#     Inria Bordeaux - Sud-Ouest,
+#     Univ. of Tennessee,
+#     King Abdullah Univesity of Science and Technology
+#     Univ. of California Berkeley,
+#     Univ. of Colorado Denver.
+#
+#  @author Florent Pruvost
+#  @date 16-09-2015
+#
+###
+
+# Generate header files
+# ---------------------
+set(CUDABLAS_HDRS_GENERATED "")
+set(ZHDR
+    cudablas_z.h
+)
+precisions_rules_py(CUDABLAS_HDRS_GENERATED "${ZHDR}"
+                    PRECISIONS "s;d;c;z;zc;ds" )
+
+# Define the list of headers
+# --------------------------
+set(CUDABLAS_HDRS
+    cudablas.h
+    ${CUDABLAS_HDRS_GENERATED}
+    )
+
+# Force generation of headers
+# ---------------------------
+add_custom_target(cudablas_include ALL SOURCES ${CUDABLAS_HDRS})
+
+set(HDR_INSTALL "cudablas.h")
+foreach( hdr_file ${CUDABLAS_HDRS_GENERATED} )
+    list(APPEND HDR_INSTALL ${CMAKE_CURRENT_BINARY_DIR}/${hdr_file})
+endforeach()
+
+# installation
+# ------------
+install(FILES ${HDR_INSTALL}
+        DESTINATION include/chameleon/cudablas/include)
+
+###
+### END CMakeLists.txt
+###
diff --git a/cudablas/include/cudablas.h b/cudablas/include/cudablas.h
new file mode 100644
index 0000000000000000000000000000000000000000..ab0e6eee738dc71b3bdb391d49c436f283d43de1
--- /dev/null
+++ b/cudablas/include/cudablas.h
@@ -0,0 +1,81 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cudablas.h
+ *
+ *  MORSE cudablas headers
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#ifndef _CUDA_BLAS_H_
+#define _CUDA_BLAS_H_
+
+#include <stdio.h>
+#include <math.h>
+#include <string.h>
+
+#if defined(CHAMELEON_USE_CUDA)
+#include <cuda.h>
+#include <cuComplex.h>
+#if defined(CHAMELEON_USE_MAGMA)
+#include <magma.h>
+#endif
+/** ****************************************************************************
+ * CUDA BLAS headers
+ **/
+#include "cudablas/include/cudablas_z.h"
+#include "cudablas/include/cudablas_d.h"
+#include "cudablas/include/cudablas_c.h"
+#include "cudablas/include/cudablas_s.h"
+#endif
+
+/** ****************************************************************************
+ * MORSE types and constants
+ **/
+#include "morse_types.h"
+#include "morse_struct.h"
+#include "morse_constants.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/*******************************************************************************
+ *  Global utilities
+ **/
+#ifndef max
+#define max(a, b) ((a) > (b) ? (a) : (b))
+#endif
+#ifndef min
+#define min(a, b) ((a) < (b) ? (a) : (b))
+#endif
+#ifndef roundup
+#define roundup(a, b) (b <= 0) ? (a) : (((a) + (b)-1) & ~((b)-1))
+#endif
+
+/** ****************************************************************************
+ *  LAPACK Constants
+ **/
+extern char *morse_lapack_constants[];
+#define morse_lapack_const(morse_const) morse_lapack_constants[morse_const][0]
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
new file mode 100644
index 0000000000000000000000000000000000000000..c63fa090111d8342aa12490a0ee3bca13079ceee
--- /dev/null
+++ b/cudablas/include/cudablas_z.h
@@ -0,0 +1,197 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2015 Inria. All rights reserved.
+ * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file cudablas_z.h
+ *
+ *  MORSE cudablas headers
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#ifndef _MORSE_CUDA_ZBLAS_H_
+#define _MORSE_CUDA_ZBLAS_H_
+
+#define COMPLEX
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** ****************************************************************************
+ *  Declarations of cuda kernels - alphabetical order
+ **/
+#if defined(CHAMELEON_USE_MAGMA)
+int CUDA_zgelqt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da, magma_int_t ldda,
+        magmaDoubleComplex *v,  magma_int_t ldv,
+        magmaDoubleComplex *dt, magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream);
+int CUDA_zgemerge(
+        magma_side_t side, magma_diag_t diag,
+        magma_int_t M, magma_int_t N,
+        magmaDoubleComplex *A, magma_int_t LDA,
+        magmaDoubleComplex *B, magma_int_t LDB,
+        CUstream stream);
+int CUDA_zgeqrt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da, magma_int_t ldda,
+        magmaDoubleComplex *v,  magma_int_t ldv,
+        magmaDoubleComplex *dt, magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream);
+int CUDA_zgessm(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t k, magma_int_t ib,
+        magma_int_t *ipiv,
+        cuDoubleComplex *dL1, magma_int_t lddl1,
+        cuDoubleComplex *dL,  magma_int_t lddl,
+        cuDoubleComplex *dA,  magma_int_t ldda,
+        magma_int_t *info);
+int CUDA_zgetrf_incpiv(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t ib,
+        cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda,
+        cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl,
+        magma_int_t *ipiv,
+        cuDoubleComplex *dwork, magma_int_t lddwork,
+        magma_int_t *info);
+int CUDA_zgetrf_nopiv(
+        magma_int_t m, magma_int_t n,
+        cuDoubleComplex *dA, magma_int_t ldda,
+        magma_int_t *info);
+int CUDA_zlauum(
+        char uplo,  magma_int_t n,
+        cuDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zparfb(
+        magma_side_t side, magma_trans_t trans,
+        magma_direct_t direct, magma_storev_t storev,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t L,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK,
+        magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream);
+int CUDA_zpotrf(
+        magma_uplo_t uplo,  magma_int_t n,
+        magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_zssssm(
+        magma_storev_t storev, magma_int_t m1, magma_int_t n1,
+        magma_int_t m2, magma_int_t n2, magma_int_t k, magma_int_t ib,
+        magmaDoubleComplex *dA1, magma_int_t ldda1,
+        magmaDoubleComplex *dA2, magma_int_t ldda2,
+        magmaDoubleComplex *dL1, magma_int_t lddl1,
+        magmaDoubleComplex *dL2, magma_int_t lddl2,
+        magma_int_t *IPIV, magma_int_t *info);
+int CUDA_ztrtri(
+        magma_uplo_t uplo,  magma_diag_t diag, magma_int_t n,
+        magmaDoubleComplex *dA, magma_int_t ldda, magma_int_t *info);
+int CUDA_ztslqt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da1, magma_int_t ldda1,
+        magmaDoubleComplex *da2, magma_int_t ldda2,
+        magmaDoubleComplex *a2,  magma_int_t lda2,
+        magmaDoubleComplex *dt,  magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream);
+int CUDA_ztsmlq(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t IB,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+              magmaDoubleComplex *WORK,  magma_int_t LDWORK,
+              magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream);
+int CUDA_ztsmqr(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M1, magma_int_t N1,
+        magma_int_t M2, magma_int_t N2,
+        magma_int_t K, magma_int_t IB,
+        magmaDoubleComplex *A1, magma_int_t LDA1,
+        magmaDoubleComplex *A2, magma_int_t LDA2,
+        const magmaDoubleComplex *V, magma_int_t LDV,
+        const magmaDoubleComplex *T, magma_int_t LDT,
+            magmaDoubleComplex *WORK,  magma_int_t LDWORK,
+            magmaDoubleComplex *WORKC, magma_int_t LDWORKC,
+        CUstream stream);
+int CUDA_ztsqrt(
+        magma_int_t m, magma_int_t n, magma_int_t nb,
+        magmaDoubleComplex *da1, magma_int_t ldda1,
+        magmaDoubleComplex *da2, magma_int_t ldda2,
+        magmaDoubleComplex *a2,  magma_int_t lda2,
+        magmaDoubleComplex *dt,  magma_int_t lddt,
+        magmaDoubleComplex *t,  magma_int_t ldt,
+        magmaDoubleComplex *dd,
+        magmaDoubleComplex *d,  magma_int_t ldd,
+        magmaDoubleComplex *tau,
+        magmaDoubleComplex *hwork,
+        magmaDoubleComplex *dwork,
+        CUstream stream);
+int CUDA_ztstrf(
+        char storev, magma_int_t m, magma_int_t n, magma_int_t ib, magma_int_t nb,
+        cuDoubleComplex *hU, magma_int_t ldhu, cuDoubleComplex *dU, magma_int_t lddu,
+        cuDoubleComplex *hA, magma_int_t ldha, cuDoubleComplex *dA, magma_int_t ldda,
+        cuDoubleComplex *hL, magma_int_t ldhl, cuDoubleComplex *dL, magma_int_t lddl,
+        magma_int_t *ipiv,
+        cuDoubleComplex *hwork, magma_int_t ldhwork,
+        cuDoubleComplex *dwork, magma_int_t lddwork,
+        magma_int_t *info);
+int CUDA_zunmlqt(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
+        const magmaDoubleComplex *A,    magma_int_t LDA,
+        const magmaDoubleComplex *T,    magma_int_t LDT,
+        magmaDoubleComplex *C,    magma_int_t LDC,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK );
+int CUDA_zunmqrt(
+        magma_side_t side, magma_trans_t trans,
+        magma_int_t M, magma_int_t N, magma_int_t K, magma_int_t IB,
+        const magmaDoubleComplex *A,    magma_int_t LDA,
+        const magmaDoubleComplex *T,    magma_int_t LDT,
+        magmaDoubleComplex *C,    magma_int_t LDC,
+        magmaDoubleComplex *WORK, magma_int_t LDWORK );
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#undef COMPLEX
+
+#endif