diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index d9859a604616b7b4c8e79f7bc8a32238b68eca40..4a1f8559a9c62bbad84f79c545c89075dd71229d 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -19,7 +19,7 @@
 #
 # @version 1.0.0
 #  @author Florent Pruvost
-#  @date 2015-09-16
+#  @date 2018-11-09
 #
 ###
 
@@ -38,11 +38,13 @@ set(ZSRC
     cuda_zsymm.c
     cuda_zsyr2k.c
     cuda_zsyrk.c
+    cuda_ztpmlqt.c
     cuda_ztpmqrt.c
     cuda_ztrmm.c
     cuda_ztrsm.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    cuda_zttmlq.c
     cuda_zttmqr.c
     cuda_zunmlqt.c
     cuda_zunmqrt.c
diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c
index b44b22ca9225b87dae0bc7cacf0de17050a6a2ef..51fb0f3c70c33d84ea01d9499f51eeac053a0791 100644
--- a/cudablas/compute/cuda_zlarfb.c
+++ b/cudablas/compute/cuda_zlarfb.c
@@ -15,21 +15,21 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int
-CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
-            cham_dir_t direct, cham_store_t storev,
-            int M, int N, int K,
-            const cuDoubleComplex *V, int LDV,
-            const cuDoubleComplex *T, int LDT,
-                  cuDoubleComplex *C, int LDC,
-                  cuDoubleComplex *WORK, int LDWORK,
-            CUBLAS_STREAM_PARAM )
+CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
+             cham_dir_t direct, cham_store_t storev,
+             int M, int N, int K,
+             const cuDoubleComplex *V, int LDV,
+             const cuDoubleComplex *T, int LDT,
+                   cuDoubleComplex *C, int LDC,
+                   cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
@@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
     }
 
     /* Quick return */
-    if ((M == 0) || (N == 0) || (K == 0))
+    if ((M == 0) || (N == 0) || (K == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
     // opposite of trans
-    if (trans == ChamNoTrans)
+    if (trans == ChamNoTrans) {
         transT = ChamConjTrans;
-    else
+    }
+    else {
         transT = ChamNoTrans;
+    }
 
     // whether T is upper or lower triangular
-    if (direct == ChamDirForward)
+    if (direct == ChamDirForward) {
         uplo = ChamUpper;
-    else
+    }
+    else {
         uplo = ChamLower;
+    }
 
     if (storev == ChamColumnwise) {
         notransV = ChamNoTrans;
@@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
         // W = W T^H = C^H V T^H
         CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
                     N, K,
-                    CUBLAS_SADDR(zone), T,    LDT,
-                                        WORK, LDWORK,
+                    &zone, T,    LDT,
+                           WORK, LDWORK,
                     CUBLAS_STREAM_VALUE );
 
         // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
@@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
         // W = W T = C V T
         CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
                     M, K,
-                    CUBLAS_SADDR(zone), T,    LDT,
-                                        WORK, LDWORK,
+                    &zone, T,    LDT,
+                           WORK, LDWORK,
                     CUBLAS_STREAM_VALUE );
 
         // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c
index 292ac3b647cebbee27d15f7336ed80a21c8de9b0..f644fcaeb0c188c9dd3f9caf8e1d499657bb28b5 100644
--- a/cudablas/compute/cuda_zparfb.c
+++ b/cudablas/compute/cuda_zparfb.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -120,40 +120,32 @@
  *         The leading dimension of the array T. LDT >= K.
  *
  * @param[in,out] WORK
- *         Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K
- *         otherwise.
+ *         Workspace of dimension at least:
+ *            - K * (M2 + N2).
+ *         If L > 0, it is recommended to extend it to
+ *            - K * (2 * M2 + N2 ) if side == ChamLeft.
+ *            - K * (M2 + 2 * N2 ) if side == ChamRight.
  *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK: LDWORK >= K, if side ==
- *         ChamLeft, LDWORK >= M1 otehrwise.
- *
- * @param[in,out] WORKC
- *         Optionnal additional workspace to replace the TRMM operation by a GEMM kernel.
- *         This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2
- *         otherwise.
- *
- * @param[in] LDWORKC
- *         The leading dimension of the array WORKC: LDWORKC >= M2, if side ==
- *         ChamLeft, LDWORK >= K otehrwise.
+ * @param[in] LWORK
+ *         The dimension of the array WORK. If LWORK < 0, returns immediately
+ *         the recommended workspace size.
  *
  *******************************************************************************
  *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval  <0 if -i, the i-th argument had an illegal value
+ * @retval  The recommended LWORK value, if LWORK == -1 on entry.
  */
 int
-CUDA_zparfb(cham_side_t side, cham_trans_t trans,
-            cham_dir_t direct, cham_store_t storev,
-            int M1, int N1, int M2, int N2, int K, int L,
-                  cuDoubleComplex *A1, int LDA1,
-                  cuDoubleComplex *A2, int LDA2,
-            const cuDoubleComplex *V, int LDV,
-            const cuDoubleComplex *T, int LDT,
-                  cuDoubleComplex *WORK, int LDWORK,
-                  cuDoubleComplex *WORKC, int LDWORKC,
-            CUBLAS_STREAM_PARAM )
+CUDA_zparfb( cham_side_t side, cham_trans_t trans,
+             cham_dir_t direct, cham_store_t storev,
+             int M1, int N1, int M2, int N2, int K, int L,
+                   cuDoubleComplex *A1, int LDA1,
+                   cuDoubleComplex *A2, int LDA2,
+             const cuDoubleComplex *V, int LDV,
+             const cuDoubleComplex *T, int LDT,
+                   cuDoubleComplex *WORK, int LWORK,
+             CUBLAS_STREAM_PARAM )
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
@@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
     double mzone = -1.0;
 #endif /* defined(PRECISION_z) || defined(PRECISION_c) */
 
+    cuDoubleComplex *workW, *workC, *workV;
+    int ldW, ldC, ldV;
     int j;
     cham_trans_t transW;
     cham_trans_t transA2;
+    int wssize = 0;
+    int wrsize = 0;
 
     CUBLAS_GET_STREAM;
 
@@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
     if (K < 0) {
         return -9;
     }
-    if ( ((LDWORK < K ) && (side == ChamLeft )) ||
-         ((LDWORK < M1) && (side == ChamRight)) ) {
+
+    if (direct == ChamDirForward) {
+        wssize = K * (M2 + N2);
+        wrsize = wssize;
+        if ( L > 0 ) {
+            wrsize +=  (side == ChamLeft) ? M2 * K : K * N2;
+        }
+    }
+
+    if ( LWORK < 0 ) {
+        return wrsize;
+    }
+    else if ( LWORK < wssize ) {
+        cudablas_error(20, "Illegal value of LWORK");
         return -20;
     }
 
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
     if (direct == ChamDirForward) {
 
         if (side == ChamLeft) {
-
             /*
              * Column or Rowwise / Forward / Left
              * ----------------------------------
@@ -222,76 +229,116 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
              *                                     ( A2 )
              */
 
+            /*
+             * Store in WORK (N1 == N2):
+             *    - Workspace W for the copy of A1 + V' * A2 (K  x N1)
+             *    - Workspace C for the copy of V * T        (M2 x K )
+             *    - Workspace V for the copy of V            (M2 x K )
+             */
+            workW = WORK;
+            ldW = K;
+
+            workC = workW + K * N1;
+            ldC = M2;
+
+            if ( L == 0 ) {
+                workV = (cuDoubleComplex*)V;
+                ldV   = LDV;
+            }
+            else {
+                if ( LWORK < wrsize ) {
+                    workC = NULL;
+                    workV = workW + K * N1;
+                }
+                else {
+                    workV = workC + M2 * K;
+                }
+                ldV = M2;
+
+                /*
+                 * Backup V, and put 0 in the lower part
+                 */
+                cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                   V,     LDV * sizeof(cuDoubleComplex),
+                                   M2 * sizeof(cuDoubleComplex), K,
+                                   cudaMemcpyDeviceToDevice, stream );
+
+                for(j = 1; j < K; j++) {
+                    cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
+                                     0.,
+                                     (L - j) * sizeof(cuDoubleComplex),
+                                     stream );
+                }
+            }
+
             /*
              * W = A1 + V' * A2:
              *      W = A1
              *      W = W + V' * A2
              *
              */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
+            cudaMemcpy2DAsync( workW, ldW  * sizeof(cuDoubleComplex),
+                               A1,    LDA1 * sizeof(cuDoubleComplex),
                                K * sizeof(cuDoubleComplex), N1,
                                cudaMemcpyDeviceToDevice, stream );
 
             transW  = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
             transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
 
-            cublasZgemm(CUBLAS_HANDLE
-                        chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
-                        K, N1, M2,
-                        CUBLAS_SADDR(zone),
-                        V     /* K*M2  */, LDV,
-                        A2    /* M2*N1 */, LDA2,
-                        CUBLAS_SADDR(zone),
-                        WORK  /* K*N1  */, LDWORK);
-
-            if (WORKC == NULL) {
+            cublasZgemm( CUBLAS_HANDLE
+                         chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
+                         K, N1, M2,
+                         CUBLAS_SADDR(zone), workV /* M2*K  */, ldV,
+                                             A2    /* M2*N2 */, LDA2,
+                         CUBLAS_SADDR(zone), workW /* K *N2 */, ldW );
+
+            if ( workC == NULL ) {
                 /* W = op(T) * W */
                 CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit,
                             K, N2,
-                            CUBLAS_SADDR(zone), T,    LDT,
-                                                WORK, LDWORK,
+                            &zone, T,     LDT,
+                                   workW, ldW,
                             CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - op(T) * W */
                 for(j = 0; j < N1; j++) {
-                    cublasZaxpy(CUBLAS_HANDLE
-                                K, CUBLAS_SADDR(mzone),
-                                (WORK + LDWORK*j), 1,
-                                (A1 + LDA1*j),     1);
+                    cublasZaxpy( CUBLAS_HANDLE
+                                 K, CUBLAS_SADDR(mzone),
+                                 workW + ldW  * j, 1,
+                                 A1    + LDA1 * j, 1 );
                 }
 
                 /* A2 = A2 - op(V) * W  */
-                cublasZgemm(CUBLAS_HANDLE
-                            chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
-                            M2, N2, K,
-                            CUBLAS_SADDR(mzone), V    /* M2*K  */, LDV,
-                                                 WORK /* K*N2  */, LDWORK,
-                            CUBLAS_SADDR(zone),  A2   /* m2*N2 */, LDA2);
+                cublasZgemm( CUBLAS_HANDLE
+                             chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
+                             M2, N2, K,
+                             CUBLAS_SADDR(mzone), V     /* M2 * K  */, LDV,
+                                                  workW /* K  * N2 */, ldW,
+                             CUBLAS_SADDR(zone),  A2    /* M2 * N2 */, LDA2 );
 
             } else {
                 /* Wc = V * op(T) */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(transA2), chameleon_cublas_const(trans),
                              M2, K, K,
-                             CUBLAS_SADDR(zone),  V, LDV,
-                                                  T, LDT,
-                             CUBLAS_SADDR(zzero), WORKC, LDWORKC );
+                             CUBLAS_SADDR(zone),  workV, ldV,
+                                                  T,     LDT,
+                             CUBLAS_SADDR(zzero), workC, ldC );
 
                 /* A1 = A1 - opt(T) * W */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
                              K, N1, K,
-                             CUBLAS_SADDR(mzone), T,    LDT,
-                                                  WORK, LDWORK,
-                             CUBLAS_SADDR(zone),  A1,   LDA1 );
+                             CUBLAS_SADDR(mzone), T,     LDT,
+                                                  workW, ldW,
+                             CUBLAS_SADDR(zone),  A1,    LDA1 );
 
                 /* A2 = A2 - Wc * W */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
                              M2, N2, K,
-                             CUBLAS_SADDR(mzone), WORKC, LDWORKC,
-                                                  WORK,  LDWORK,
+                             CUBLAS_SADDR(mzone), workC, ldC,
+                                                  workW, ldW,
                              CUBLAS_SADDR(zone),  A2,    LDA2 );
             }
         }
@@ -304,14 +351,56 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
              *
              */
 
+            /*
+             * Store in WORK (M1 == M2):
+             *    - Workspace W for the copy of A1 + A2 * V' (M1 x K )
+             *    - Workspace C for the copy of V * T        (K  x N2)
+             *    - Workspace V for the copy of V            (K  x N2)
+             */
+            workW = WORK;
+            ldW = M1;
+
+            workC = workW + M1 * K;
+            ldC = K;
+
+            if ( L == 0 ) {
+                workV = (cuDoubleComplex*)V;
+                ldV   = LDV;
+            }
+            else {
+                if ( LWORK < wrsize ) {
+                    workC = NULL;
+                    workV = workW + M2 * K;
+                }
+                else {
+                    workV = workC + K * N2;
+                }
+                ldV = K;
+
+                /*
+                 * Backup V, and put 0 in the upper part
+                 */
+                cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                   V,     LDV * sizeof(cuDoubleComplex),
+                                   K * sizeof(cuDoubleComplex), N2,
+                                   cudaMemcpyDeviceToDevice, stream );
+
+                for(j = 1; j < K; j++) {
+                    cudaMemsetAsync( workV + ldV + N2 - L + j,
+                                     0.,
+                                     j * sizeof(cuDoubleComplex),
+                                     stream );
+                }
+            }
+
             /*
              * W = A1 + A2 * V':
              *      W = A1
              *      W = W + A2 * V'
              *
              */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
+            cudaMemcpy2DAsync( workW, ldW  * sizeof(cuDoubleComplex),
+                               A1,    LDA1 * sizeof(cuDoubleComplex),
                                M1 * sizeof(cuDoubleComplex), K,
                                cudaMemcpyDeviceToDevice, stream );
 
@@ -321,40 +410,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
             cublasZgemm(CUBLAS_HANDLE
                         chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW),
                         M1, K, N2,
-                        CUBLAS_SADDR(zone), A2   /* M1*N2 */, LDA2,
-                                            V    /* N2*K  */, LDV,
-                        CUBLAS_SADDR(zone), WORK /* M1*K  */, LDWORK);
+                        CUBLAS_SADDR(zone), A2    /* M1*N2 */, LDA2,
+                                            workV /* K *N2 */, ldV,
+                        CUBLAS_SADDR(zone), workW /* M1*K  */, ldW);
 
-            if (WORKC == NULL) {
+            if ( workC == NULL ) {
                 /* W = W * op(T) */
                 CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit,
                             M2, K,
-                            CUBLAS_SADDR(zone), T,    LDT,
-                                                WORK, LDWORK,
+                            &zone, T,     LDT,
+                                   workW, ldW,
                             CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - W * op(T) */
                 for(j = 0; j < K; j++) {
-                    cublasZaxpy(CUBLAS_HANDLE
-                                M1, CUBLAS_SADDR(mzone),
-                                (WORK + LDWORK*j), 1,
-                                (A1 + LDA1*j), 1);
+                    cublasZaxpy( CUBLAS_HANDLE
+                                 M1, CUBLAS_SADDR(mzone),
+                                 workW + ldW  * j, 1,
+                                 A1    + LDA1 * j, 1 );
                 }
 
                 /* A2 = A2 - W * op(V)  */
                 cublasZgemm(CUBLAS_HANDLE
                             chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
                             M2, N2, K,
-                            CUBLAS_SADDR(mzone), WORK /* M2*K  */, LDWORK,
-                                                 V    /* K*N2  */, LDV,
-                            CUBLAS_SADDR(zone),  A2   /* M2*N2 */, LDA2);
+                            CUBLAS_SADDR(mzone), workW /* M2*K  */, ldW,
+                                                 V     /* K *N2 */, LDV,
+                            CUBLAS_SADDR(zone),  A2    /* M2*N2 */, LDA2);
 
             } else {
                 /* A1 = A1 - W * opt(T) */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans),
                              M1, K, K,
-                             CUBLAS_SADDR(mzone), WORK, LDWORK,
+                             CUBLAS_SADDR(mzone), workW, ldW,
                                                   T,    LDT,
                              CUBLAS_SADDR(zone),  A1,   LDA1 );
 
@@ -363,15 +452,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
                              chameleon_cublas_const(trans), chameleon_cublas_const(transA2),
                              K, N2, K,
                              CUBLAS_SADDR(zone),  T,     LDT,
-                                                  V,     LDV,
-                             CUBLAS_SADDR(zzero), WORKC, LDWORKC );
+                                                  workV, ldV,
+                             CUBLAS_SADDR(zzero), workC, ldC );
 
                 /* A2 = A2 - W * Wc */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
                              M2, N2, K,
-                             CUBLAS_SADDR(mzone), WORK,  LDWORK,
-                                                  WORKC, LDWORKC,
+                             CUBLAS_SADDR(mzone), workW, ldW,
+                                                  workC, ldC,
                              CUBLAS_SADDR(zone),  A2,    LDA2 );
             }
         }
diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f01e0e28cf34690cc3e3daa1823c418e9557fac
--- /dev/null
+++ b/cudablas/compute/cuda_ztpmlqt.c
@@ -0,0 +1,184 @@
+/**
+ *
+ * @file cuda_ztpmlqt.c
+ *
+ * @copyright 2009-2016 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_ztpmlqt GPU kernel
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2018-11-09
+ * @precisions normal z -> c d s
+ *
+ */
+#include "cudablas.h"
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * @brief Applies a complex orthogonal matrix Q.
+ *
+ * The matrix Q is obtained from a "triangular-pentagonal" complex block
+ * reflector H to a general complex matrix C, which consists of two blocks A and
+ * B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg ChamLeft  : apply Q or Q**H from the Left;
+ *         @arg ChamRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg ChamNoTrans   :  No transpose, apply Q;
+ *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B. N >= 0.
+ *
+ * @param[in] K
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ *          K >= L >= 0.  See Further Details.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTPQRT in the first k rows of its array argument V.
+ *
+ * @param[in] LDV
+ *         The leading dimension of the array V. LDV >= max(1,K).
+ *
+ * @param[in] T
+ *         The IB-by-N1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[in,out] A
+ *         A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
+ *         or (LDA,K) if SIDE = ChamRight
+ *         On entry, the K-by-N or M-by-K matrix A.
+ *         On exit, A is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A. LDA >= max(1,M).
+ *         If side = ChamLeft,  LDA >= max(1,K);
+ *         If side = Chamright, LDA >= max(1,M).
+ *
+ * @param[in,out] B
+ *         On entry, the M-by-N tile B.
+ *         On exit, B is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDB
+ *         The leading dimension of the tile B. LDB >= max(1,M).
+ *
+ * @param[out] WORK
+ *         Workspace array of size LDWORK-by-NB.
+ *         LDWORK = N if side = ChamLeft, or  M if side = ChamRight.
+ *
+ *******************************************************************************
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The columns of the pentagonal matrix V contain the elementary reflectors
+ *  H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
+ *  trapezoidal block V2:
+ *
+ *        V = [V1] [V2].
+ *
+ *  The size of the trapezoidal block V2 is determined by the parameter L,
+ *  where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L
+ *  rows of a K-by-K upper triangular matrix.  If L=K, V2 is lower triangular;
+ *  if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
+ *
+ *  If side = ChamLeft:  C = [A]  where A is K-by-N,  B is M-by-N and V is K-by-M.
+ *                            [B]
+ *
+ *  If side = ChamRight: C = [A B]  where A is M-by-K, B is M-by-N and V is K-by-N.
+ *
+ *  The complex orthogonal matrix Q is formed from V and T.
+ *
+ *  If trans='N' and side='L', C is on exit replaced with Q * C.
+ *
+ *  If trans='C' and side='L', C is on exit replaced with Q**H * C.
+ *
+ *  If trans='N' and side='R', C is on exit replaced with C * Q.
+ *
+ *  If trans='C' and side='R', C is on exit replaced with C * Q**H.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
+int
+CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
+              int M, int N, int K, int L, int IB,
+              const cuDoubleComplex *V, int LDV,
+              const cuDoubleComplex *T, int LDT,
+                    cuDoubleComplex *A, int LDA,
+                    cuDoubleComplex *B, int LDB,
+                    cuDoubleComplex *WORK, int lwork,
+              CUBLAS_STREAM_PARAM )
+{
+    int m1, n1;
+
+    /* Check input arguments */
+    if ((side != ChamLeft) && (side != ChamRight)) {
+        cudablas_error(1, "Illegal value of side");
+        return -1;
+    }
+
+    if ( side == ChamLeft ) {
+        m1 = K;
+        n1 = N;
+    }
+    else {
+        m1 = M;
+        n1 = K;
+    }
+
+    /* TS case */
+    if (L == 0) {
+        CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB,
+                     A, LDA, B, LDB, V, LDV, T, LDT,
+                     WORK, lwork,
+                     CUBLAS_STREAM_VALUE );
+    }
+    /* TT case */
+    else  if( L == N ) {
+        CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB,
+                     A, LDA, B, LDB, V, LDV, T, LDT,
+                     WORK, lwork,
+                     CUBLAS_STREAM_VALUE );
+    }
+    else {
+        cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" );
+        return -6;
+    }
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c
index 2719edd32b1fef478dd150f67b89ba0c8dc465e2..c7a19fd76726491b982c0da9be7c0ff444fd829c 100644
--- a/cudablas/compute/cuda_ztpmqrt.c
+++ b/cudablas/compute/cuda_ztpmqrt.c
@@ -13,12 +13,128 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * @brief Applies a complex orthogonal matrix Q.
+ *
+ * The matrix Q is obtained from a "triangular-pentagonal" complex block
+ * reflector H to a general complex matrix C, which consists of two blocks A and
+ * B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg ChamLeft  : apply Q or Q**H from the Left;
+ *         @arg ChamRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg ChamNoTrans   :  No transpose, apply Q;
+ *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B. N >= 0.
+ *
+ * @param[in] K
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ *          K >= L >= 0.  See Further Details.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTPQRT in the first k rows of its array argument V.
+ *
+ * @param[in] LDV
+ *         The leading dimension of the array V. LDV >= max(1,K).
+ *
+ * @param[in] T
+ *         The IB-by-N1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[in,out] A
+ *         A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
+ *         or (LDA,K) if SIDE = ChamRight
+ *         On entry, the K-by-N or M-by-K matrix A.
+ *         On exit, A is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A. LDA >= max(1,M).
+ *         If side = ChamLeft,  LDA >= max(1,K);
+ *         If side = Chamright, LDA >= max(1,M).
+ *
+ * @param[in,out] B
+ *         On entry, the M-by-N tile B.
+ *         On exit, B is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDB
+ *         The leading dimension of the tile B. LDB >= max(1,M).
+ *
+ * @param[out] WORK
+ *         Workspace array of size LDWORK-by-NB.
+ *         LDWORK = N if side = ChamLeft, or  M if side = ChamRight.
+ *
+ *******************************************************************************
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The columns of the pentagonal matrix V contain the elementary reflectors
+ *  H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
+ *  trapezoidal block V2:
+ *
+ *        V = [V1]
+ *            [V2].
+ *
+ *  The size of the trapezoidal block V2 is determined by the parameter L,
+ *  where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L
+ *  rows of a K-by-K upper triangular matrix.  If L=K, V2 is upper triangular;
+ *  if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
+ *
+ *  If side = ChamLeft:  C = [A]  where A is K-by-N,  B is M-by-N and V is M-by-K.
+ *                            [B]
+ *
+ *  If side = ChamRight: C = [A B]  where A is M-by-K, B is M-by-N and V is N-by-K.
+ *
+ *  The complex orthogonal matrix Q is formed from V and T.
+ *
+ *  If trans='N' and side='L', C is on exit replaced with Q * C.
+ *
+ *  If trans='C' and side='L', C is on exit replaced with Q**H * C.
+ *
+ *  If trans='N' and side='R', C is on exit replaced with C * Q.
+ *
+ *  If trans='C' and side='R', C is on exit replaced with C * Q**H.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
 int
 CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
               int M, int N, int K, int L, int IB,
@@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
               const cuDoubleComplex *T, int LDT,
                     cuDoubleComplex *A, int LDA,
                     cuDoubleComplex *B, int LDB,
-                    cuDoubleComplex *WORK,
+                    cuDoubleComplex *WORK, int lwork,
               CUBLAS_STREAM_PARAM )
 {
-    int m1, n1, ldwork, ldworkc, ws;
+    int m1, n1;
 
     /* Check input arguments */
     if ((side != ChamLeft) && (side != ChamRight)) {
@@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
     if ( side == ChamLeft ) {
         m1 = K;
         n1 = N;
-        ldwork  = IB;
-        ldworkc = M;
-        ws = ldwork * n1;
     }
     else {
         m1 = M;
         n1 = K;
-        ldwork = chameleon_max( K, chameleon_max( M, N ) );
-        ldworkc = IB;
-        ws = ldwork * IB;
     }
 
     /* TS case */
     if (L == 0) {
         CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB,
                      A, LDA, B, LDB, V, LDV, T, LDT,
-                     WORK, ldwork, WORK + ws, ldworkc,
+                     WORK, lwork,
                      CUBLAS_STREAM_VALUE );
     }
     /* TT case */
     else  if( L == M ) {
         CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
                      A, LDA, B, LDB, V, LDV, T, LDT,
-                     WORK, ldwork, WORK + ws, ldworkc,
+                     WORK, lwork,
                      CUBLAS_STREAM_VALUE );
     }
     else {
diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c
index 7d2e3c4a3fbc19d5ec319991a9d56a537608016b..fa949538f7782eb169a3ceb5fe237f4dc046fe05 100644
--- a/cudablas/compute/cuda_ztsmlq.c
+++ b/cudablas/compute/cuda_ztsmlq.c
@@ -13,24 +13,24 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @author Mathieu Faverge
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int CUDA_ztsmlq(
-        cham_side_t side, cham_trans_t trans,
-        int M1, int N1,
-        int M2, int N2,
-        int K, int IB,
-              cuDoubleComplex *A1,    int LDA1,
-              cuDoubleComplex *A2,    int LDA2,
-        const cuDoubleComplex *V,     int LDV,
-        const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
-        CUBLAS_STREAM_PARAM)
+    cham_side_t side, cham_trans_t trans,
+    int M1, int N1,
+    int M2, int N2,
+    int K, int IB,
+    cuDoubleComplex *A1,    int LDA1,
+    cuDoubleComplex *A2,    int LDA2,
+    const cuDoubleComplex *V,     int LDV,
+    const cuDoubleComplex *T,     int LDT,
+    cuDoubleComplex *WORK,  int LWORK,
+    CUBLAS_STREAM_PARAM)
 {
     int i, i1, i3;
     int NW;
@@ -90,21 +90,20 @@ int CUDA_ztsmlq(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft) && (trans == ChamNoTrans))
-        || ((side == ChamRight) && (trans != ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans == ChamNoTrans))  ||
+         ((side == ChamRight) && (trans != ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
@@ -115,7 +114,7 @@ int CUDA_ztsmlq(
         trans = ChamNoTrans;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -137,13 +136,13 @@ int CUDA_ztsmlq(
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
         CUDA_zparfb(
-                side, trans, ChamDirForward, ChamRowwise,
-                mi, ni, M2, N2, kb, 0,
-                A1 + LDA1*jc+ic, LDA1,
-                A2, LDA2,
-                V + i, LDV,
-                T + LDT*i, LDT,
-                WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            side, trans, ChamDirForward, ChamRowwise,
+            mi, ni, M2, N2, kb, 0,
+            A1 + LDA1*jc+ic, LDA1,
+            A2, LDA2,
+            V + i, LDV,
+            T + LDT*i, LDT,
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c
index 4b07b9b9ce16397eabf8e358d81b7499eeab2c39..6a655a5401d1f1f7830afcbd19d4e2579fb48fa6 100644
--- a/cudablas/compute/cuda_ztsmqr.c
+++ b/cudablas/compute/cuda_ztsmqr.c
@@ -13,24 +13,24 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @author Mathieu Faverge
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int CUDA_ztsmqr(
-        cham_side_t side, cham_trans_t trans,
-        int M1, int N1,
-        int M2, int N2,
-        int K, int IB,
-              cuDoubleComplex *A1,    int LDA1,
-              cuDoubleComplex *A2,    int LDA2,
-        const cuDoubleComplex *V,     int LDV,
-        const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
-        CUBLAS_STREAM_PARAM)
+    cham_side_t side, cham_trans_t trans,
+    int M1, int N1,
+    int M2, int N2,
+    int K, int IB,
+    cuDoubleComplex *A1,    int LDA1,
+    cuDoubleComplex *A2,    int LDA2,
+    const cuDoubleComplex *V,     int LDV,
+    const cuDoubleComplex *T,     int LDT,
+    cuDoubleComplex *WORK,  int LWORK,
+    CUBLAS_STREAM_PARAM)
 {
     int i, i1, i3;
     int NQ, NW;
@@ -92,25 +92,24 @@ int CUDA_ztsmqr(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft)  && (trans != ChamNoTrans))
-        || ((side == ChamRight) && (trans == ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans != ChamNoTrans))  ||
+         ((side == ChamRight) && (trans == ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -127,17 +126,18 @@ int CUDA_ztsmqr(
             ni = N1 - i;
             jc = i;
         }
+
         /*
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
         CUDA_zparfb(
-                side, trans, ChamDirForward, ChamColumnwise,
-                mi, ni, M2, N2, kb, 0,
-                A1 + LDA1*jc+ic, LDA1,
-                A2, LDA2,
-                V + LDV*i, LDV,
-                T + LDT*i, LDT,
-                WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            side, trans, ChamDirForward, ChamColumnwise,
+            mi, ni, M2, N2, kb, 0,
+            A1 + LDA1*jc+ic, LDA1,
+            A2, LDA2,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c
new file mode 100644
index 0000000000000000000000000000000000000000..ddde1ed384df7bcb5ce7360de93bdfc3aaa926e2
--- /dev/null
+++ b/cudablas/compute/cuda_zttmlq.c
@@ -0,0 +1,149 @@
+/**
+ *
+ * @file cuda_zttmlq.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_zttmlq GPU kernel
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @author Mathieu Faverge
+ * @date 2018-11-09
+ * @precisions normal z -> c d s
+ *
+ */
+#include "cudablas.h"
+
+int CUDA_zttmlq(
+        cham_side_t side, cham_trans_t trans,
+        int M1, int N1,
+        int M2, int N2,
+        int K, int IB,
+              cuDoubleComplex *A1,    int LDA1,
+              cuDoubleComplex *A2,    int LDA2,
+        const cuDoubleComplex *V,     int LDV,
+        const cuDoubleComplex *T,     int LDT,
+              cuDoubleComplex *WORK,  int LWORK,
+        CUBLAS_STREAM_PARAM)
+{
+    int i, i1, i3;
+    int NW;
+    int kb, l;
+    int ic = 0;
+    int jc = 0;
+    int mi1 = M1;
+    int mi2 = M2;
+    int ni1 = N1;
+    int ni2 = N2;
+
+    /* Check input arguments */
+    if ((side != ChamLeft) && (side != ChamRight)) {
+        return -1;
+    }
+
+    /* NQ is the order of Q */
+    if (side == ChamLeft) {
+        NW = IB;
+    }
+    else {
+        NW = N1;
+    }
+
+    if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
+        return -2;
+    }
+    if (M1 < 0) {
+        return -3;
+    }
+    if (N1 < 0) {
+        return -4;
+    }
+    if ( (M2 < 0) ||
+         ( (M2 != M1) && (side == ChamRight) ) ){
+        return -5;
+    }
+    if ( (N2 < 0) ||
+         ( (N2 != N1) && (side == ChamLeft) ) ){
+        return -6;
+    }
+    if ((K < 0) ||
+        ( (side == ChamLeft)  && (K > M1) ) ||
+        ( (side == ChamRight) && (K > N1) ) ) {
+        return -7;
+    }
+    if (IB < 0) {
+        return -8;
+    }
+    if (LDA1 < chameleon_max(1,M1)){
+        return -10;
+    }
+    if (LDA2 < chameleon_max(1,M2)){
+        return -12;
+    }
+    if (LDV < chameleon_max(1,K)){
+        return -14;
+    }
+    if (LDT < chameleon_max(1,IB)){
+        return -16;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) ||
+         ((side == ChamRight) && (trans != ChamNoTrans)) )
+    {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ( ( K-1 ) / IB )*IB;
+        i3 = -IB;
+    }
+
+    /* Transpose */
+    if (trans == ChamNoTrans) {
+        trans = ChamConjTrans;
+    }
+    else {
+        trans = ChamNoTrans;
+    }
+
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
+        kb = chameleon_min(IB, K-i);
+
+        if (side == ChamLeft) {
+            mi1 = kb;
+            mi2 = chameleon_min(i+kb, M2);
+            l   = chameleon_min(kb, chameleon_max(0, M2-i));
+            ic  = i;
+        }
+        else {
+            ni1 = kb;
+            ni2 = chameleon_min(i+kb, N2);
+            l   = chameleon_min(kb, chameleon_max(0, N2-i));
+            jc  = i;
+        }
+
+        /*
+         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb)
+         */
+        CUDA_zparfb(
+            side, trans, ChamDirForward, ChamRowwise,
+            mi1, ni1, mi2, ni2, kb, l,
+            A1 + LDA1 * jc + ic, LDA1,
+            A2, LDA2,
+            V + i,       LDV,
+            T + LDT * i, LDT,
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
+    }
+    return CHAMELEON_SUCCESS;
+}
diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c
index 236405cdf080fd3c7941aa62426a70cde2e5d8f7..e2a91ee817915b5453d79bfc987e0345ecab0022 100644
--- a/cudablas/compute/cuda_zttmqr.c
+++ b/cudablas/compute/cuda_zttmqr.c
@@ -6,6 +6,7 @@
  *                      Tennessee Research Foundation. All rights reserved.
  * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
  *                      Univ. Bordeaux. All rights reserved.
+ *
  ***
  *
  * @brief Chameleon cuda_zttmqr GPU kernel
@@ -13,7 +14,7 @@
  * @version 1.0.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -28,13 +29,12 @@ int CUDA_zttmqr(
               cuDoubleComplex *A2,    int LDA2,
         const cuDoubleComplex *V,     int LDV,
         const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
+              cuDoubleComplex *WORK,  int LWORK,
         CUBLAS_STREAM_PARAM)
 {
-    int i, i1, i3, l;
+    int i, i1, i3;
     int NQ, NW;
-    int kb;
+    int kb, l;
     int ic = 0;
     int jc = 0;
     int mi1 = M1;
@@ -94,25 +94,24 @@ int CUDA_zttmqr(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft)  && (trans != ChamNoTrans))
-        || ((side == ChamRight) && (trans == ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) ||
+         ((side == ChamRight) && (trans == ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -138,8 +137,7 @@ int CUDA_zttmqr(
             A2, LDA2,
             V + LDV*i, LDV,
             T + LDT*i, LDT,
-            WORK, LDWORK,
-            WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h
index 8e96d463ca451c74b17f91c22c2b6c03a27a101e..8895ff6485b33c423bd80b5b99599883eec58ce7 100644
--- a/cudablas/include/cudablas/cudablas_z.h
+++ b/cudablas/include/cudablas/cudablas_z.h
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -31,16 +31,18 @@ int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleCom
 int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM );
 int CUDA_zherk(  cham_uplo_t uplo, cham_trans_t trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
 int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
-int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zsymm(  cham_side_t side, cham_uplo_t uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_zsyrk(  cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
-int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM );
+int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
+int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
 int CUDA_ztrmm(  cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
 int CUDA_ztrsm(  cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
-int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
-int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
-int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );