diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c
index 973612e7ad05331f78e8e73744f23cc6caac7ceb..3bd8d9e7c8f686602db7b4233a557fbc44c30014 100644
--- a/compute/pztpgqrt.c
+++ b/compute/pztpgqrt.c
@@ -58,7 +58,7 @@ void morse_pztpgqrt( int L,
     int ib, minMT;
 
     /* Dimension of the first column */
-    int maxm  = Q2->m - L;
+    int maxm  = chameleon_max( Q2->m - L, 1 );
     int maxmt = (maxm % Q2->mb == 0) ? (maxm / Q2->mb) : (maxm / Q2->mb + 1);
     int maxmtk;
 
diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c
index 25af2c85e1ed60ba4b9ddbc39a9bb3d167a7e6c7..1807857a266542ca2a902ea6511df151fcee439c 100644
--- a/compute/pztpqrt.c
+++ b/compute/pztpqrt.c
@@ -45,7 +45,7 @@ void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
     int ib;
 
     /* Dimension of the first column */
-    int maxm  = B->m - L;
+    int maxm  = chameleon_max( B->m - L, 1 );
     int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1);
 
     morse = morse_context_self();
diff --git a/coreblas/compute/core_zttmqr.c b/coreblas/compute/core_zttmqr.c
index 95b8aa2df576d45ecbc8bb8895dc4327889e31c9..95760f3cd10868775a03ca2ac732f8888cf0cbef 100644
--- a/coreblas/compute/core_zttmqr.c
+++ b/coreblas/compute/core_zttmqr.c
@@ -3,9 +3,8 @@
  * @copyright (c) 2009-2014 The University of Tennessee and The University
  *                          of Tennessee Research Foundation.
  *                          All rights reserved.
- * @copyright (c) 2012-2014 Inria. All rights reserved.
- * @copyright (c) 2012-2014 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved.
- *
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
  **/
 
 /**
@@ -247,10 +246,10 @@ int CORE_zttmqr(MORSE_enum side, MORSE_enum trans,
         CORE_zparfb(
             side, trans, MorseForward, MorseColumnwise,
             mi1, ni1, mi2, ni2, kb, l,
-            &A1[LDA1*jc+ic], LDA1,
+            A1 + LDA1*jc+ic, LDA1,
             A2, LDA2,
-            &V[LDV*i], LDV,
-            &T[LDT*i], LDT,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
             WORK, LDWORK);
     }
     return MORSE_SUCCESS;
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index 40686c92fb785f93f8b7ec6de0f545e4759ec770..06d25bd476f7d7f5286e40eefd583cee53d2de2a 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -43,6 +43,7 @@ set(ZSRC
     cuda_ztrsm.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    cuda_zttmqr.c
     cuda_zunmlqt.c
     cuda_zunmqrt.c
     )
diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c
index 83e0b393aedc1488a37d5a7609f75f52b204924f..edca0c8d7e13b14995ac2c74857cdfa5106e38ca 100644
--- a/cudablas/compute/cuda_zparfb.c
+++ b/cudablas/compute/cuda_zparfb.c
@@ -25,6 +25,130 @@
 #include "cudablas/include/cudablas.h"
 #include "cudablas/include/cudablas_z.h"
 
+/**
+ *****************************************************************************
+ *
+ * @ingroup CUDA_MORSE_Complex64_t
+ *
+ *  CUDA_zparfb applies a complex upper triangular block reflector H
+ *  or its transpose H' to a complex rectangular matrix formed by
+ *  coupling two tiles A1 and A2. Matrix V is:
+ *
+ *          COLUMNWISE                    ROWWISE
+ *
+ *         |     K     |                 |      N2-L     |   L  |
+ *      __ _____________ __           __ _________________        __
+ *         |    |      |                 |               | \
+ *         |    |      |                 |               |   \    L
+ *    M2-L |    |      |              K  |_______________|_____\  __
+ *         |    |      | M2              |                      |
+ *      __ |____|      |                 |                      | K-L
+ *         \    |      |              __ |______________________| __
+ *       L   \  |      |
+ *      __     \|______| __              |          N2          |
+ *
+ *         | L |  K-L  |
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg MorseLeft  : apply Q or Q**H from the Left;
+ *         @arg MorseRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg MorseNoTrans   : No transpose, apply Q;
+ *         @arg MorseConjTrans : ConjTranspose, apply Q**H.
+ *
+ * @param[in] direct
+ *         Indicates how H is formed from a product of elementary
+ *         reflectors
+ *         @arg MorseForward  : H = H(1) H(2) . . . H(k) (Forward)
+ *         @arg MorseBackward : H = H(k) . . . H(2) H(1) (Backward)
+ *
+ * @param[in] storev
+ *         Indicates how the vectors which define the elementary
+ *         reflectors are stored:
+ *         @arg MorseColumnwise
+ *         @arg MorseRowwise
+ *
+ * @param[in] M1
+ *         The number of columns of the tile A1. M1 >= 0.
+ *
+ * @param[in] N1
+ *         The number of rows of the tile A1. N1 >= 0.
+ *
+ * @param[in] M2
+ *         The number of columns of the tile A2. M2 >= 0.
+ *
+ * @param[in] N2
+ *         The number of rows of the tile A2. N2 >= 0.
+ *
+ * @param[in] K
+ *         The order of the matrix T (= the number of elementary
+ *         reflectors whose product defines the block reflector).
+ *
+ * @param[in] L
+ *         The size of the triangular part of V
+ *
+ * @param[in,out] A1
+ *         On entry, the M1-by-N1 tile A1.
+ *         On exit, A1 is overwritten by the application of Q.
+ *
+ * @param[in] LDA1
+ *         The leading dimension of the array A1. LDA1 >= max(1,N1).
+ *
+ * @param[in,out] A2
+ *         On entry, the M2-by-N2 tile A2.
+ *         On exit, A2 is overwritten by the application of Q.
+ *
+ * @param[in] LDA2
+ *         The leading dimension of the tile A2. LDA2 >= max(1,N2).
+ *
+ * @param[in] V
+ *         (LDV,K) if STOREV = 'C'
+ *         (LDV,M2) if STOREV = 'R' and SIDE = 'L'
+ *         (LDV,N2) if STOREV = 'R' and SIDE = 'R'
+ *         Matrix V.
+ *
+ * @param[in] LDV
+ *         The leading dimension of the array V.
+ *         If STOREV = 'C' and SIDE = 'L', LDV >= max(1,M2);
+ *         if STOREV = 'C' and SIDE = 'R', LDV >= max(1,N2);
+ *         if STOREV = 'R', LDV >= K.
+ *
+ * @param[out] T
+ *         The triangular K-by-K matrix T in the representation of the
+ *         block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= K.
+ *
+ * @param[in,out] WORK
+ *         Workspace of dimension LDWORK-by-N1 if side == MorseLeft, LDWORK-by-K
+ *         otherwise.
+ *
+ * @param[in] LDWORK
+ *         The leading dimension of the array WORK: LDWORK >= K, if side ==
+ *         MorseLeft, LDWORK >= M1 otehrwise.
+ *
+ * @param[in,out] WORKC
+ *         Optionnal additional workspace to replace the TRMM operation by a GEMM kernel.
+ *         This workspace is of dimension LDWORK-by-K if side == MorseLeft, LDWORK-by-N2
+ *         otherwise.
+ *
+ * @param[in] LDWORKC
+ *         The leading dimension of the array WORKC: LDWORKC >= M2, if side ==
+ *         MorseLeft, LDWORK >= K otehrwise.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
 int
 CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
             MORSE_enum direct, MORSE_enum storev,
@@ -83,6 +207,10 @@ CUDA_zparfb(MORSE_enum side, MORSE_enum trans,
     if (K < 0) {
         return -9;
     }
+    if ( ((LDWORK < K ) && (side == MorseLeft )) ||
+         ((LDWORK < M1) && (side == MorseRight)) ) {
+        return -20;
+    }
 
     /* Quick return */
     if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c
index b9f6afac4cabd9a66c017c29e16abad91e7a2b97..350dc3df8dd730314599cc023c9228066e8e58ec 100644
--- a/cudablas/compute/cuda_ztpmqrt.c
+++ b/cudablas/compute/cuda_ztpmqrt.c
@@ -48,14 +48,14 @@ CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans,
         n1 = N;
         ldwork  = IB;
         ldworkc = M;
-        ws = K * n1;
+        ws = ldwork * n1;
     }
     else {
         m1 = M;
         n1 = K;
         ldwork  = m1;
         ldworkc = IB;
-        ws = m1 * K;
+        ws = ldwork * IB;
     }
 
     /* TS case */
@@ -67,16 +67,14 @@ CUDA_ztpmqrt( MORSE_enum side, MORSE_enum trans,
     }
     /* TT case */
     else  if( L == M ) {
-        cudablas_error(-6, "TTMQRT not available on GPU yet\n" );
-        return -6;
-        /* CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB, */
-        /*              A, LDA, B, LDB, V, LDV, T, LDT, */
-        /*              WORK, ldwork ); */
+        CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
+                     A, LDA, B, LDB, V, LDV, T, LDT,
+                     WORK, ldwork, WORK + ws, ldworkc,
+                     CUBLAS_STREAM_VALUE );
     }
     else {
-        cudablas_error(-6, "TPMQRT not available on GPU yet\n" );
+        cudablas_error(-6, "TPMQRT not available on GPU for general cases yet\n" );
         return -6;
-        //LAPACKE_ztpmqrt_work( LAPACK_COL_MAJOR, M, N, K, L, IB, V, LDV, T, LDT, A, LDA, B, LDB, WORK );
     }
 
     return MORSE_SUCCESS;
diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c
new file mode 100644
index 0000000000000000000000000000000000000000..04f59bf8140801312f635e3d801efd4ff23cbcaf
--- /dev/null
+++ b/cudablas/compute/cuda_zttmqr.c
@@ -0,0 +1,151 @@
+/**
+ *
+ * @copyright (c) 2009-2014 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2017 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ **/
+
+/**
+ *
+ * @file cuda_zttmqr.c
+ *
+ *  MORSE cudablas kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver,
+ *  and INRIA Bordeaux Sud-Ouest
+ *
+ * @author Florent Pruvost
+ * @author Mathieu Faverge
+ * @date 2015-09-16
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "cudablas/include/cudablas.h"
+#include "cudablas/include/cudablas_z.h"
+
+int CUDA_zttmqr(
+        MORSE_enum side, MORSE_enum trans,
+        int M1, int N1,
+        int M2, int N2,
+        int K, int IB,
+              cuDoubleComplex *A1,    int LDA1,
+              cuDoubleComplex *A2,    int LDA2,
+        const cuDoubleComplex *V,     int LDV,
+        const cuDoubleComplex *T,     int LDT,
+              cuDoubleComplex *WORK,  int LDWORK,
+              cuDoubleComplex *WORKC, int LDWORKC,
+        CUBLAS_STREAM_PARAM)
+{
+    int i, i1, i3, l;
+    int NQ, NW;
+    int kb;
+    int ic = 0;
+    int jc = 0;
+    int mi1 = M1;
+    int mi2 = M2;
+    int ni1 = N1;
+    int ni2 = N2;
+
+    /* Check input arguments */
+    if ((side != MorseLeft) && (side != MorseRight)) {
+        return -1;
+    }
+
+    /* NQ is the order of Q */
+    if (side == MorseLeft) {
+        NQ = M2;
+        NW = IB;
+    }
+    else {
+        NQ = N2;
+        NW = M1;
+    }
+
+    if ((trans != MorseNoTrans) && (trans != MorseConjTrans)) {
+        return -2;
+    }
+    if (M1 < 0) {
+        return -3;
+    }
+    if (N1 < 0) {
+        return -4;
+    }
+    if ( (M2 < 0) ||
+         ( (M2 != M1) && (side == MorseRight) ) ){
+        return -5;
+    }
+    if ( (N2 < 0) ||
+         ( (N2 != N1) && (side == MorseLeft) ) ){
+        return -6;
+    }
+    if ((K < 0) ||
+        ( (side == MorseLeft)  && (K > M1) ) ||
+        ( (side == MorseRight) && (K > N1) ) ) {
+        return -7;
+    }
+    if (IB < 0) {
+        return -8;
+    }
+    if (LDA1 < chameleon_max(1,M1)){
+        return -10;
+    }
+    if (LDA2 < chameleon_max(1,M2)){
+        return -12;
+    }
+    if (LDV < chameleon_max(1,NQ)){
+        return -14;
+    }
+    if (LDT < chameleon_max(1,IB)){
+        return -16;
+    }
+    if (LDWORK < chameleon_max(1,NW)){
+        return -18;
+    }
+
+    /* Quick return */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+        return MORSE_SUCCESS;
+
+    if (((side == MorseLeft)  && (trans != MorseNoTrans))
+        || ((side == MorseRight) && (trans == MorseNoTrans))) {
+        i1 = 0;
+        i3 = IB;
+    }
+    else {
+        i1 = ((K-1) / IB)*IB;
+        i3 = -IB;
+    }
+
+    for(i = i1; (i > -1) && (i < K); i += i3) {
+        kb = chameleon_min(IB, K-i);
+
+        if (side == MorseLeft) {
+            mi1 = kb;
+            mi2 = chameleon_min(i+kb, M2);
+            l   = chameleon_min(kb, chameleon_max(0, M2-i));
+            ic  = i;
+        }
+        else {
+            ni1 = kb;
+            ni2 = chameleon_min(i+kb, N2);
+            l   = chameleon_min(kb, chameleon_max(0, N2-i));
+            jc  = i;
+        }
+
+        /*
+         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb)
+         */
+        CUDA_zparfb(
+            side, trans, MorseForward, MorseColumnwise,
+            mi1, ni1, mi2, ni2, kb, l,
+            A1 + LDA1*jc+ic, LDA1,
+            A2, LDA2,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
+            WORK, LDWORK,
+            WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+    }
+    return MORSE_SUCCESS;
+}
diff --git a/cudablas/include/cudablas_z.h b/cudablas/include/cudablas_z.h
index e73912750cb83353c4f013520c5c0db7a55ba7dc..911d1ef93328392186f6063e66e9b21572596844 100644
--- a/cudablas/include/cudablas_z.h
+++ b/cudablas/include/cudablas_z.h
@@ -51,6 +51,7 @@ int CUDA_ztrmm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum
 int CUDA_ztrsm(  MORSE_enum side, MORSE_enum uplo, MORSE_enum transa, MORSE_enum diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM);
 int CUDA_ztsmlq( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
 int CUDA_ztsmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
+int CUDA_zttmqr( MORSE_enum side, MORSE_enum trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM);
 int CUDA_zunmlqt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmqrt(MORSE_enum side, MORSE_enum trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
index 665628cf4f9283f0dea3bcbe67dd3ebda5b8f590..ed4f7dbe9303d36cef5385dd33cddb54b9583fd4 100644
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ b/runtime/starpu/codelets/codelet_ztsmqr.c
@@ -281,8 +281,8 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
     starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
                                &lda1, &lda2, &ldv, &ldt, &ldwork);
 
-    WC = W + ib * (side == MorseLeft ? n1 : m1);
-    ldworkc = (side == MorseLeft) ? m1 : ib;
+    WC = W + ib * (side == MorseLeft ? m1 : n1);
+    ldworkc = (side == MorseLeft) ? m2 : ib;
 
     stream = starpu_cuda_get_local_stream();
     cublasSetKernelStream( stream );
@@ -299,7 +299,6 @@ static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
-
 /*
  * Codelet definition
  */
diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c
index 3368c6da7ecfb8bd00a7d269d98e88f487159144..6f2f600a8c83f1b299df8a951bddcb4abe019a7e 100644
--- a/runtime/starpu/codelets/codelet_zttmqr.c
+++ b/runtime/starpu/codelets/codelet_zttmqr.c
@@ -239,9 +239,59 @@ static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
     CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
 }
+
+#if defined(CHAMELEON_USE_CUDA)
+static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int m1;
+    int n1;
+    int m2;
+    int n2;
+    int k;
+    int ib;
+    cuDoubleComplex *A1;
+    int lda1;
+    cuDoubleComplex *A2;
+    int lda2;
+    cuDoubleComplex *V;
+    int ldv;
+    cuDoubleComplex *T;
+    int ldt;
+    cuDoubleComplex *W, *WC;
+    int ldwork;
+    int ldworkc;
+    CUstream stream;
+
+    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
+    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
+
+    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
+                               &lda1, &lda2, &ldv, &ldt, &ldwork);
+
+    WC = W + ib * (side == MorseLeft ? m1 : n1);
+    ldworkc = (side == MorseLeft) ? m2 : ib;
+
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
+    CUDA_zttmqr(
+            side, trans, m1, n1, m2, n2, k, ib,
+            A1, lda1, A2, lda2, V, ldv, T, ldt,
+            W, ldwork, WC, ldworkc, stream );
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS_CPU(zttmqr, 5, cl_zttmqr_cpu_func)
+CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC)