diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 6bf814a62bd42c1b1107eb759f89fdb69e2e808e..313b83eebd7e3815e2a35dcea4f7d9c8ab2f4727 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -19,7 +19,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -65,10 +65,10 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     }
 
     /*
-     * zgelqt = A->nb * (ib+1)
-     * zunmlq = A->nb * ib
-     * ztslqt = A->nb * (ib+1)
-     * ztsmlq = A->nb * ib
+     * zgelqt  = A->nb * (ib+1)
+     * zunmlq  = A->nb * ib
+     * ztplqt  = A->nb * (ib+1)
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * (ib+1);
 
@@ -76,8 +76,8 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 2 * A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c
index c91cea637129374915d28d8a9ee0743e82c7acda..7cc655e6deef5bd566445956ecf19886095816f4 100644
--- a/compute/pzgelqf_param.c
+++ b/compute/pzgelqf_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -69,10 +69,10 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_CUDA)
     /*
-     * zunmqr  = A->nb * ib
-     * ztpmqrt = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index d9b77e5487e11f6c23e95bc347dc7943a5439ca5..7cec545bbf0318e47fe5920993876608e8c50cb7 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -62,10 +62,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
     }
 
      /*
-     * zgelqt = A->nb * (ib+1)
-     * zunmlq = A->nb * ib
-     * ztplqt = A->nb * (ib+1)
-     * ztpmlq = A->nb * ib
+     * zgelqt  = A->nb * (ib+1)
+     * zunmlq  = A->nb * ib
+     * ztplqt  = A->nb * (ib+1)
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * (ib+1);
 
@@ -73,10 +73,10 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztpmqr = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 650e769a022a38c2066aadd75dfa0d776aa62f78..4ee0c4dabb4d13afdb94ea51bdce23f6b64d9fe8 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -19,7 +19,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -60,10 +60,10 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
     }
 
     /*
-     * zgeqrt = A->nb * (ib+1)
-     * zunmqr = A->nb * ib
-     * ztsqrt = A->nb * (ib+1)
-     * ztsmqr = A->nb * ib
+     * zgeqrt  = A->nb * (ib+1)
+     * zunmqr  = A->nb * ib
+     * ztpqrt  = A->nb * (ib+1)
+     * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * (ib+1);
 
@@ -71,8 +71,8 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 2 * A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c
index b297e598ff0e0bf40899be877d4942ab172dd7c6..72dcc8e5775b315a2e22f383f57e153fd1ee186e 100644
--- a/compute/pzgeqrf_param.c
+++ b/compute/pzgeqrf_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -74,10 +74,10 @@ void chameleon_pzgeqrf_param( int genD, int K,
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_CUDA)
     /*
-     * zunmqr  = A->nb * ib
-     * ztpmqrt = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index b075a0ef19c860c3952220cf58d0f826b8b394ab..74d153ae80f99c8f19bc0913470c69759a9d45f6 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -62,10 +62,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
     }
 
     /*
-     * zgeqrt = A->nb * (ib+1)
-     * zunmqr = A->nb * ib
-     * ztpqrt = A->nb * (ib+1)
-     * ztpmqr = A->nb * ib
+     * zgeqrt  = A->nb * (ib+1)
+     * zunmqr  = A->nb * ib
+     * ztpqrt  = A->nb * (ib+1)
+     * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * (ib+1);
 
@@ -73,10 +73,10 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztpmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c
index 711651d15bfcc711bb46315a7d936f4c64c18c74..623a25517808356acaa7b61373e831a9b043452b 100644
--- a/compute/pzhetrd_he2hb.c
+++ b/compute/pzhetrd_he2hb.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Hatem Ltaief
  * @author Azzam Haidar
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -74,9 +74,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
+     * zunmqr =     A->nb * ib
      * ztsmqr = 2 * A->nb * ib
-     * zherfb = A->nb * ib
+     * zherfb =     A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c
index ec1880018e8b2b0f2089e7a9cc6364d135095b5b..52d99f42850f531dc8177d30f73c12169040272b 100644
--- a/compute/pztpgqrt.c
+++ b/compute/pztpgqrt.c
@@ -14,7 +14,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -66,9 +66,9 @@ void chameleon_pztpgqrt( int KT, int L,
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * ztpmqrt = 2 * Q1->nb * ib
+     * ztpmqrt = 3 * Q1->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * Q1->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * Q1->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c
index 37de659fee2f50ac169db02532dd00082b38bc1c..0783d14da4e55f294f3b784349039485952da66d 100644
--- a/compute/pztpqrt.c
+++ b/compute/pztpqrt.c
@@ -14,7 +14,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -61,9 +61,9 @@ void chameleon_pztpqrt( int L, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *T,
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_CUDA)
     /*
-     * ztpmqrt = 2 * A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index 05ad8cdef0ae7fb131376b631559f8f14dfb7f42..492843fc3710220d31abda8728dfbaf0903555b7 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -19,7 +19,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -67,8 +67,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
     }
 
     /*
-     * zunmlq = A->nb * ib
-     * ztpmlq = A->nb * ib
+     * zunmlq  = A->nb * ib
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
@@ -76,8 +76,8 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmlq = A->nb * ib
-     * ztpmlq = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 2 * A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c
index d98a98ac70c4c7909f937e99bd588d1a95107eb2..b70080358fb49e66f3176097d98ecb9422ec44f2 100644
--- a/compute/pzunglq_param.c
+++ b/compute/pzunglq_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -63,18 +63,18 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztpmqr = A->nb * ib
+     * zunmlq  = A->nb * ib
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztpmqr = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index 95d6a7df46b5125504e2986c54e15f95ea96f94f..d31203a9eec0ad3c51fe633a2469eb065fad568e 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -18,7 +18,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2011-05-24
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -64,18 +64,18 @@ void chameleon_pzunglqrh( int genD, int BS,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztpmqr = A->nb * ib
+     * zunmlq  = A->nb * ib
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztpmqr = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index f9cc9182224acb9d6b01366f863b3d1c54fa779c..a190027288631727f1a08a2e4b1bcbe2fb69af19 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -19,7 +19,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -68,8 +68,8 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztsmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
+     * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
@@ -77,8 +77,8 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 2 * A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c
index 6f550160137776a091b9ed685ebf24dd64b1c992..8bcb901c13a962a4e199a8e0c2d64966e81ca3ea 100644
--- a/compute/pzungqr_param.c
+++ b/compute/pzungqr_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -60,17 +60,17 @@ void chameleon_pzungqr_param( int genD, int K,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztpmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
+     * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_CUDA)
     /*
-     * ztpmqrt = 2 * A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index 8fbfb9f489647bac8526a48b824380231cadb2ea..23d5e29db53f570516eee7231e4b326fde04e140 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -67,8 +67,7 @@ void chameleon_pzungqrrh( int genD, int BS,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztsmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
      * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
@@ -76,10 +75,10 @@ void chameleon_pzungqrrh( int genD, int BS,
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index faff5895e38d393a431996019f483a0ee2a9a4ef..a5d23bf2b1db09789830af95372ffd67d46d76da 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -70,16 +70,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
     }
 
     /*
-     * zunmlq = A->mb * ib
-     * ztsmlq = A->mb * ib
+     * zunmlq  = A->mb * ib
+     * ztpmlqt = A->mb * ib
      */
     ws_worker = A->mb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmlq = A->mb * ib
-     * ztsmlq = 2 * A->mb * ib
+     * zunmlq  =     A->mb * ib
+     * ztpmlqt = 2 * A->mb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->mb * 2 );
 #endif
diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c
index 01c34e52b5a9a2f61b322737125a53322394b7c8..48dbc13eaab3b75b2a75b01c567c68524041680e 100644
--- a/compute/pzunmlq_param.c
+++ b/compute/pzunmlq_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -63,19 +63,18 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
     }
 
     /*
-     * zunmlq = A->nb * ib
-     * ztsmlq = A->nb * ib
-     * zttmlq = A->nb * ib
+     * zunmlq  = A->nb * ib
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmlq = A->nb * ib
-     * ztsmlq = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 23d0381769f2878af18f28f992f446dce18e1b97..26130cb69a7071245bc341f2310b01067e35f5de 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -65,19 +65,18 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
     }
 
     /*
-     * zunmlq = A->nb * ib
-     * ztsmlq = A->nb * ib
-     * zttmlq = A->nb * ib
+     * zunmlq  = A->nb * ib
+     * ztpmlqt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmlq = A->nb * ib
-     * ztsmlq = 2 * A->nb * ib
+     * zunmlq  =     A->nb * ib
+     * ztpmlqt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index 9000c51582d83b89c351f94ac5533f88d792f0d6..4c15ba7fd0ac6ec475583f73c190c34caae6f1bb 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -70,16 +70,16 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztsmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
+     * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 2 * A->nb * ib
      */
     ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
 #endif
diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c
index 7a9b2b6e11765a47f3aaaad0e654765ca25b2b1a..772bfdf48f4310ef9a96a7de427fdc7c35a09f49 100644
--- a/compute/pzunmqr_param.c
+++ b/compute/pzunmqr_param.c
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Raphael Boucherie
- * @date 2017-05-17
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -63,8 +63,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztsmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
      * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
@@ -72,10 +71,10 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index 1073db6ac13142e4e71f6bbb761de68d4402470a..e8429f8cfeb7ae92709228dd8fab734364149944 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> s d c
  *
  */
@@ -66,8 +66,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
     }
 
     /*
-     * zunmqr = A->nb * ib
-     * ztsmqr = A->nb * ib
+     * zunmqr  = A->nb * ib
      * ztpmqrt = A->nb * ib
      */
     ws_worker = A->nb * ib;
@@ -75,10 +74,10 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 #if defined(CHAMELEON_USE_CUDA)
     /* Worker space
      *
-     * zunmqr = A->nb * ib
-     * ztsmqr = 2 * A->nb * ib
+     * zunmqr  =     A->nb * ib
+     * ztpmqrt = 3 * A->nb * ib
      */
-    ws_worker = chameleon_max( ws_worker, ib * A->nb * 2 );
+    ws_worker = chameleon_max( ws_worker, ib * A->nb * 3 );
 #endif
 
     ws_worker *= sizeof(CHAMELEON_Complex64_t);
diff --git a/coreblas/compute/core_zparfb.c b/coreblas/compute/core_zparfb.c
index 5199173acfadea031f5d656407638c7b0e9a8c18..a359402d6b90c8aa0484e1d1b587c413060561c4 100644
--- a/coreblas/compute/core_zparfb.c
+++ b/coreblas/compute/core_zparfb.c
@@ -18,7 +18,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2011-06-14
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -139,7 +139,8 @@
  */
 /* This kernel is never traced so return type on previous line for convert2eztrace.pl script */
 int
-CORE_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev,
+CORE_zparfb(cham_side_t side, cham_trans_t trans,
+            cham_dir_t direct, cham_store_t storev,
             int M1, int N1, int M2, int N2, int K, int L,
                   CHAMELEON_Complex64_t *A1, int LDA1,
                   CHAMELEON_Complex64_t *A2, int LDA2,
diff --git a/coreblas/compute/core_ztpmlqt.c b/coreblas/compute/core_ztpmlqt.c
index 72e54b1ad14c6590adb0a034f7d1c86d22df9737..bfc53b98dba91a9ca56fbf71e2b6b3faf1828c99 100644
--- a/coreblas/compute/core_ztpmlqt.c
+++ b/coreblas/compute/core_ztpmlqt.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -24,9 +24,11 @@
  *
  * @ingroup CORE_CHAMELEON_Complex64_t
  *
- * CORE_ztpmlqt applies a complex orthogonal matrix Q obtained from a
- * "triangular-pentagonal" complex block reflector H to a general complex matrix
- * C, which consists of two blocks A and B.
+ * @brief Applies a complex orthogonal matrix Q to a general complex matrix C.
+ *
+ * The matrix Q is obtained from a "triangular-pentagonal" complex block
+ * reflector H, and the matrix C it is applied to consists of two blocks
+ * A and B.
  *
  *******************************************************************************
  *
@@ -128,9 +130,8 @@
  *
  *******************************************************************************
  *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
  *
  */
 
diff --git a/coreblas/compute/core_zttmlq.c b/coreblas/compute/core_zttmlq.c
index 69a7004c6d9d9cd148766f822025fcdbe7f441a9..5b6ee0261ec8e920f4883847931526fc864ddf76 100644
--- a/coreblas/compute/core_zttmlq.c
+++ b/coreblas/compute/core_zttmlq.c
@@ -19,7 +19,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -127,9 +127,9 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans,
                 const CHAMELEON_Complex64_t *T, int LDT,
                 CHAMELEON_Complex64_t *WORK, int LDWORK)
 {
-    int i, i1, i3, l;
+    int i, i1, i3;
     int NW;
-    int kb;
+    int kb, l;
     int ic = 0;
     int jc = 0;
     int mi1 = M1;
@@ -205,11 +205,13 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans,
     }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft) && (trans == ChamNoTrans))
-        || ((side == ChamRight) && (trans != ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) ||
+         ((side == ChamRight) && (trans != ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
@@ -248,13 +250,11 @@ int CORE_zttmlq(cham_side_t side, cham_trans_t trans,
         CORE_zparfb(
             side, trans, ChamDirForward, ChamRowwise,
             mi1, ni1, mi2, ni2, kb, l,
-            &A1[LDA1*jc+ic], LDA1,
+            A1 + LDA1 * jc + ic, LDA1,
             A2, LDA2,
-            &V[i], LDV,
-            &T[LDT*i], LDT,
+            V + i, LDV,
+            T + LDT * i, LDT,
             WORK, LDWORK);
     }
     return CHAMELEON_SUCCESS;
 }
-
-
diff --git a/cudablas/compute/CMakeLists.txt b/cudablas/compute/CMakeLists.txt
index d9859a604616b7b4c8e79f7bc8a32238b68eca40..4a1f8559a9c62bbad84f79c545c89075dd71229d 100644
--- a/cudablas/compute/CMakeLists.txt
+++ b/cudablas/compute/CMakeLists.txt
@@ -19,7 +19,7 @@
 #
 # @version 1.0.0
 #  @author Florent Pruvost
-#  @date 2015-09-16
+#  @date 2018-11-09
 #
 ###
 
@@ -38,11 +38,13 @@ set(ZSRC
     cuda_zsymm.c
     cuda_zsyr2k.c
     cuda_zsyrk.c
+    cuda_ztpmlqt.c
     cuda_ztpmqrt.c
     cuda_ztrmm.c
     cuda_ztrsm.c
     cuda_ztsmlq.c
     cuda_ztsmqr.c
+    cuda_zttmlq.c
     cuda_zttmqr.c
     cuda_zunmlqt.c
     cuda_zunmqrt.c
diff --git a/cudablas/compute/cuda_zlarfb.c b/cudablas/compute/cuda_zlarfb.c
index b44b22ca9225b87dae0bc7cacf0de17050a6a2ef..51fb0f3c70c33d84ea01d9499f51eeac053a0791 100644
--- a/cudablas/compute/cuda_zlarfb.c
+++ b/cudablas/compute/cuda_zlarfb.c
@@ -15,21 +15,21 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int
-CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
-            cham_dir_t direct, cham_store_t storev,
-            int M, int N, int K,
-            const cuDoubleComplex *V, int LDV,
-            const cuDoubleComplex *T, int LDT,
-                  cuDoubleComplex *C, int LDC,
-                  cuDoubleComplex *WORK, int LDWORK,
-            CUBLAS_STREAM_PARAM )
+CUDA_zlarfb( cham_side_t side, cham_trans_t trans,
+             cham_dir_t direct, cham_store_t storev,
+             int M, int N, int K,
+             const cuDoubleComplex *V, int LDV,
+             const cuDoubleComplex *T, int LDT,
+                   cuDoubleComplex *C, int LDC,
+                   cuDoubleComplex *WORK, int LDWORK,
+             CUBLAS_STREAM_PARAM )
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
@@ -67,20 +67,25 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
     }
 
     /* Quick return */
-    if ((M == 0) || (N == 0) || (K == 0))
+    if ((M == 0) || (N == 0) || (K == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
     // opposite of trans
-    if (trans == ChamNoTrans)
+    if (trans == ChamNoTrans) {
         transT = ChamConjTrans;
-    else
+    }
+    else {
         transT = ChamNoTrans;
+    }
 
     // whether T is upper or lower triangular
-    if (direct == ChamDirForward)
+    if (direct == ChamDirForward) {
         uplo = ChamUpper;
-    else
+    }
+    else {
         uplo = ChamLower;
+    }
 
     if (storev == ChamColumnwise) {
         notransV = ChamNoTrans;
@@ -106,8 +111,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
         // W = W T^H = C^H V T^H
         CUDA_ztrmm( ChamRight, uplo, transT, ChamNonUnit,
                     N, K,
-                    CUBLAS_SADDR(zone), T,    LDT,
-                                        WORK, LDWORK,
+                    &zone, T,    LDT,
+                           WORK, LDWORK,
                     CUBLAS_STREAM_VALUE );
 
         // C = C - V W^H = C - V T V^H C = (I - V T V^H) C = H C
@@ -133,8 +138,8 @@ CUDA_zlarfb(cham_side_t side, cham_trans_t trans,
         // W = W T = C V T
         CUDA_ztrmm( ChamRight, uplo, trans, ChamNonUnit,
                     M, K,
-                    CUBLAS_SADDR(zone), T,    LDT,
-                                        WORK, LDWORK,
+                    &zone, T,    LDT,
+                           WORK, LDWORK,
                     CUBLAS_STREAM_VALUE );
 
         // C = C - W V^H = C - C V T V^H = C (I - V T V^H) = C H
diff --git a/cudablas/compute/cuda_zparfb.c b/cudablas/compute/cuda_zparfb.c
index 292ac3b647cebbee27d15f7336ed80a21c8de9b0..bcef47797c5892accf0073aa9e2fd67b2cb6ab1b 100644
--- a/cudablas/compute/cuda_zparfb.c
+++ b/cudablas/compute/cuda_zparfb.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -120,40 +120,32 @@
  *         The leading dimension of the array T. LDT >= K.
  *
  * @param[in,out] WORK
- *         Workspace of dimension LDWORK-by-N1 if side == ChamLeft, LDWORK-by-K
- *         otherwise.
+ *         Workspace of dimension at least:
+ *            - K * (M2 + N2).
+ *         If L > 0, it is recommended to extend it to
+ *            - K * (2 * M2 + N2 ) if side == ChamLeft.
+ *            - K * (M2 + 2 * N2 ) if side == ChamRight.
  *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK: LDWORK >= K, if side ==
- *         ChamLeft, LDWORK >= M1 otehrwise.
- *
- * @param[in,out] WORKC
- *         Optionnal additional workspace to replace the TRMM operation by a GEMM kernel.
- *         This workspace is of dimension LDWORK-by-K if side == ChamLeft, LDWORK-by-N2
- *         otherwise.
- *
- * @param[in] LDWORKC
- *         The leading dimension of the array WORKC: LDWORKC >= M2, if side ==
- *         ChamLeft, LDWORK >= K otehrwise.
+ * @param[in] LWORK
+ *         The dimension of the array WORK. If LWORK < 0, the routine
+ *         immediately returns the recommended workspace size.
  *
  *******************************************************************************
  *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval  <0 if -i, the i-th argument had an illegal value
+ * @retval  The recommended LWORK value, if LWORK < 0 on entry.
  */
 int
-CUDA_zparfb(cham_side_t side, cham_trans_t trans,
-            cham_dir_t direct, cham_store_t storev,
-            int M1, int N1, int M2, int N2, int K, int L,
-                  cuDoubleComplex *A1, int LDA1,
-                  cuDoubleComplex *A2, int LDA2,
-            const cuDoubleComplex *V, int LDV,
-            const cuDoubleComplex *T, int LDT,
-                  cuDoubleComplex *WORK, int LDWORK,
-                  cuDoubleComplex *WORKC, int LDWORKC,
-            CUBLAS_STREAM_PARAM )
+CUDA_zparfb( cham_side_t side, cham_trans_t trans,
+             cham_dir_t direct, cham_store_t storev,
+             int M1, int N1, int M2, int N2, int K, int L,
+                   cuDoubleComplex *A1, int LDA1,
+                   cuDoubleComplex *A2, int LDA2,
+             const cuDoubleComplex *V, int LDV,
+             const cuDoubleComplex *T, int LDT,
+                   cuDoubleComplex *WORK, int LWORK,
+             CUBLAS_STREAM_PARAM )
 {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     cuDoubleComplex zzero = make_cuDoubleComplex(0.0, 0.0);
@@ -165,9 +157,13 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
     double mzone = -1.0;
 #endif /* defined(PRECISION_z) || defined(PRECISION_c) */
 
+    cuDoubleComplex *workW, *workC, *workV;
+    int ldW, ldC, ldV;
     int j;
     cham_trans_t transW;
     cham_trans_t transA2;
+    int wssize = 0;
+    int wrsize = 0;
 
     CUBLAS_GET_STREAM;
 
@@ -201,19 +197,30 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
     if (K < 0) {
         return -9;
     }
-    if ( ((LDWORK < K ) && (side == ChamLeft )) ||
-         ((LDWORK < M1) && (side == ChamRight)) ) {
+
+    if (direct == ChamDirForward) {
+        wssize = K * (M2 + N2);
+        wrsize = wssize;
+        if ( L > 0 ) {
+            wrsize +=  (side == ChamLeft) ? M2 * K : K * N2;
+        }
+    }
+
+    if ( LWORK < 0 ) {
+        return wrsize;
+    }
+    else if ( LWORK < wssize ) {
+        cudablas_error(20, "Illegal value of LWORK");
         return -20;
     }
 
-    /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
     if (direct == ChamDirForward) {
 
         if (side == ChamLeft) {
-
             /*
              * Column or Rowwise / Forward / Left
              * ----------------------------------
@@ -222,76 +229,137 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
              *                                     ( A2 )
              */
 
+            /*
+             * Store in WORK (N1 == N2):
+             *    - Workspace W for the copy of A1 + V' * A2 (K  x N1)
+             *    - Workspace C for the copy of V * T        (M2 x K )
+             *    - Workspace V for the copy of V            (M2 x K )
+             */
+            workW = WORK;
+            ldW = K;
+
+            workC = workW + K * N1;
+            ldC = M2;
+
+            if ( L == 0 ) {
+                workV = (cuDoubleComplex*)V;
+                ldV   = LDV;
+            }
+            else {
+                if ( LWORK < wrsize ) {
+                    workC = NULL;
+                    workV = workW + K * N1;
+                }
+                else {
+                    workV = workC + M2 * K;
+                }
+
+                if ( storev == ChamColumnwise ) {
+                    ldV = M2;
+
+                    /*
+                     * Backup V, and put 0 in the lower part
+                     */
+                    cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                       V,     LDV * sizeof(cuDoubleComplex),
+                                       M2 * sizeof(cuDoubleComplex), K,
+                                       cudaMemcpyDeviceToDevice, stream );
+
+                    for(j = 1; j < K; j++) {
+                        cudaMemsetAsync( workV + (j-1) * ldV + M2 - L + j,
+                                         0,
+                                         (L - j) * sizeof(cuDoubleComplex),
+                                         stream );
+                    }
+                }
+                else {
+                    ldV = K;
+
+                    /*
+                     * Backup V, and put 0 in the upper part
+                     */
+                    cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                       V,     LDV * sizeof(cuDoubleComplex),
+                                       K * sizeof(cuDoubleComplex), M2,
+                                       cudaMemcpyDeviceToDevice, stream );
+
+                    for(j = 1; j < K; j++) {
+                        cudaMemsetAsync( workV + ldV * ( M2 - L + j ),
+                                         0,
+                                         j * sizeof(cuDoubleComplex),
+                                         stream );
+                    }
+                }
+            }
+
             /*
              * W = A1 + V' * A2:
              *      W = A1
              *      W = W + V' * A2
              *
              */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
+            cudaMemcpy2DAsync( workW, ldW  * sizeof(cuDoubleComplex),
+                               A1,    LDA1 * sizeof(cuDoubleComplex),
                                K * sizeof(cuDoubleComplex), N1,
                                cudaMemcpyDeviceToDevice, stream );
 
             transW  = storev == ChamColumnwise ? ChamConjTrans : ChamNoTrans;
             transA2 = storev == ChamColumnwise ? ChamNoTrans : ChamConjTrans;
 
-            cublasZgemm(CUBLAS_HANDLE
-                        chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
-                        K, N1, M2,
-                        CUBLAS_SADDR(zone),
-                        V     /* K*M2  */, LDV,
-                        A2    /* M2*N1 */, LDA2,
-                        CUBLAS_SADDR(zone),
-                        WORK  /* K*N1  */, LDWORK);
-
-            if (WORKC == NULL) {
+            cublasZgemm( CUBLAS_HANDLE
+                         chameleon_cublas_const(transW), chameleon_cublas_const(ChamNoTrans),
+                         K, N1, M2,
+                         CUBLAS_SADDR(zone), workV /* M2*K  */, ldV,
+                                             A2    /* M2*N2 */, LDA2,
+                         CUBLAS_SADDR(zone), workW /* K *N2 */, ldW );
+
+            if ( workC == NULL ) {
                 /* W = op(T) * W */
                 CUDA_ztrmm( ChamLeft, ChamUpper, trans, ChamNonUnit,
                             K, N2,
-                            CUBLAS_SADDR(zone), T,    LDT,
-                                                WORK, LDWORK,
+                            &zone, T,     LDT,
+                                   workW, ldW,
                             CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - op(T) * W */
                 for(j = 0; j < N1; j++) {
-                    cublasZaxpy(CUBLAS_HANDLE
-                                K, CUBLAS_SADDR(mzone),
-                                (WORK + LDWORK*j), 1,
-                                (A1 + LDA1*j),     1);
+                    cublasZaxpy( CUBLAS_HANDLE
+                                 K, CUBLAS_SADDR(mzone),
+                                 workW + ldW  * j, 1,
+                                 A1    + LDA1 * j, 1 );
                 }
 
                 /* A2 = A2 - op(V) * W  */
-                cublasZgemm(CUBLAS_HANDLE
-                            chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
-                            M2, N2, K,
-                            CUBLAS_SADDR(mzone), V    /* M2*K  */, LDV,
-                                                 WORK /* K*N2  */, LDWORK,
-                            CUBLAS_SADDR(zone),  A2   /* m2*N2 */, LDA2);
+                cublasZgemm( CUBLAS_HANDLE
+                             chameleon_cublas_const(transA2), chameleon_cublas_const(ChamNoTrans),
+                             M2, N2, K,
+                             CUBLAS_SADDR(mzone), workV /* M2 * K  */, ldV,
+                                                  workW /* K  * N2 */, ldW,
+                             CUBLAS_SADDR(zone),  A2    /* M2 * N2 */, LDA2 );
 
             } else {
                 /* Wc = V * op(T) */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(transA2), chameleon_cublas_const(trans),
                              M2, K, K,
-                             CUBLAS_SADDR(zone),  V, LDV,
-                                                  T, LDT,
-                             CUBLAS_SADDR(zzero), WORKC, LDWORKC );
+                             CUBLAS_SADDR(zone),  workV, ldV,
+                                                  T,     LDT,
+                             CUBLAS_SADDR(zzero), workC, ldC );
 
                 /* A1 = A1 - opt(T) * W */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(trans), chameleon_cublas_const(ChamNoTrans),
                              K, N1, K,
-                             CUBLAS_SADDR(mzone), T,    LDT,
-                                                  WORK, LDWORK,
-                             CUBLAS_SADDR(zone),  A1,   LDA1 );
+                             CUBLAS_SADDR(mzone), T,     LDT,
+                                                  workW, ldW,
+                             CUBLAS_SADDR(zone),  A1,    LDA1 );
 
                 /* A2 = A2 - Wc * W */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
                              M2, N2, K,
-                             CUBLAS_SADDR(mzone), WORKC, LDWORKC,
-                                                  WORK,  LDWORK,
+                             CUBLAS_SADDR(mzone), workC, ldC,
+                                                  workW, ldW,
                              CUBLAS_SADDR(zone),  A2,    LDA2 );
             }
         }
@@ -304,14 +372,77 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
              *
              */
 
+            /*
+             * Store in WORK (M1 == M2):
+             *    - Workspace W for the copy of A1 + A2 * V' (M1 x K )
+             *    - Workspace C for the copy of T * V        (K  x N2)
+             *    - Workspace V for the copy of V            (K  x N2)
+             */
+            workW = WORK;
+            ldW = M1;
+
+            workC = workW + M1 * K;
+            ldC = K;
+
+            if ( L == 0 ) {
+                workV = (cuDoubleComplex*)V;
+                ldV   = LDV;
+            }
+            else {
+                if ( LWORK < wrsize ) {
+                    workC = NULL;
+                    workV = workW + M2 * K;
+                }
+                else {
+                    workV = workC + K * N2;
+                }
+
+                if ( storev == ChamColumnwise ) {
+                    ldV = N2;
+
+                    /*
+                     * Backup V, and put 0 in the lower part
+                     */
+                    cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                       V,     LDV * sizeof(cuDoubleComplex),
+                                       N2 * sizeof(cuDoubleComplex), K,
+                                       cudaMemcpyDeviceToDevice, stream );
+
+                    for(j = 1; j < K; j++) {
+                        cudaMemsetAsync( workV + (j-1) * ldV + N2 - L + j,
+                                         0,
+                                         (L - j) * sizeof(cuDoubleComplex),
+                                         stream );
+                    }
+                }
+                else {
+                    ldV = K;
+
+                    /*
+                     * Backup V, and put 0 in the upper part
+                     */
+                    cudaMemcpy2DAsync( workV, ldV * sizeof(cuDoubleComplex),
+                                       V,     LDV * sizeof(cuDoubleComplex),
+                                       K * sizeof(cuDoubleComplex), N2,
+                                       cudaMemcpyDeviceToDevice, stream );
+
+                    for(j = 1; j < K; j++) {
+                        cudaMemsetAsync( workV + ldV * ( N2 - L + j ),
+                                         0,
+                                         j * sizeof(cuDoubleComplex),
+                                         stream );
+                    }
+                }
+            }
+
             /*
              * W = A1 + A2 * V':
              *      W = A1
              *      W = W + A2 * V'
              *
              */
-            cudaMemcpy2DAsync( WORK, LDWORK * sizeof(cuDoubleComplex),
-                               A1,   LDA1   * sizeof(cuDoubleComplex),
+            cudaMemcpy2DAsync( workW, ldW  * sizeof(cuDoubleComplex),
+                               A1,    LDA1 * sizeof(cuDoubleComplex),
                                M1 * sizeof(cuDoubleComplex), K,
                                cudaMemcpyDeviceToDevice, stream );
 
@@ -321,40 +452,40 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
             cublasZgemm(CUBLAS_HANDLE
                         chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transW),
                         M1, K, N2,
-                        CUBLAS_SADDR(zone), A2   /* M1*N2 */, LDA2,
-                                            V    /* N2*K  */, LDV,
-                        CUBLAS_SADDR(zone), WORK /* M1*K  */, LDWORK);
+                        CUBLAS_SADDR(zone), A2    /* M1*N2 */, LDA2,
+                                            workV /* K *N2 */, ldV,
+                        CUBLAS_SADDR(zone), workW /* M1*K  */, ldW);
 
-            if (WORKC == NULL) {
+            if ( workC == NULL ) {
                 /* W = W * op(T) */
                 CUDA_ztrmm( ChamRight, ChamUpper, trans, ChamNonUnit,
                             M2, K,
-                            CUBLAS_SADDR(zone), T,    LDT,
-                                                WORK, LDWORK,
+                            &zone, T,     LDT,
+                                   workW, ldW,
                             CUBLAS_STREAM_VALUE );
 
                 /* A1 = A1 - W = A1 - W * op(T) */
                 for(j = 0; j < K; j++) {
-                    cublasZaxpy(CUBLAS_HANDLE
-                                M1, CUBLAS_SADDR(mzone),
-                                (WORK + LDWORK*j), 1,
-                                (A1 + LDA1*j), 1);
+                    cublasZaxpy( CUBLAS_HANDLE
+                                 M1, CUBLAS_SADDR(mzone),
+                                 workW + ldW  * j, 1,
+                                 A1    + LDA1 * j, 1 );
                 }
 
                 /* A2 = A2 - W * op(V)  */
                 cublasZgemm(CUBLAS_HANDLE
                             chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(transA2),
                             M2, N2, K,
-                            CUBLAS_SADDR(mzone), WORK /* M2*K  */, LDWORK,
-                                                 V    /* K*N2  */, LDV,
-                            CUBLAS_SADDR(zone),  A2   /* M2*N2 */, LDA2);
+                            CUBLAS_SADDR(mzone), workW /* M2*K  */, ldW,
+                                                 workV /* K *N2 */, ldV,
+                            CUBLAS_SADDR(zone),  A2    /* M2*N2 */, LDA2);
 
             } else {
                 /* A1 = A1 - W * opt(T) */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(trans),
                              M1, K, K,
-                             CUBLAS_SADDR(mzone), WORK, LDWORK,
+                             CUBLAS_SADDR(mzone), workW, ldW,
                                                   T,    LDT,
                              CUBLAS_SADDR(zone),  A1,   LDA1 );
 
@@ -363,15 +494,15 @@ CUDA_zparfb(cham_side_t side, cham_trans_t trans,
                              chameleon_cublas_const(trans), chameleon_cublas_const(transA2),
                              K, N2, K,
                              CUBLAS_SADDR(zone),  T,     LDT,
-                                                  V,     LDV,
-                             CUBLAS_SADDR(zzero), WORKC, LDWORKC );
+                                                  workV, ldV,
+                             CUBLAS_SADDR(zzero), workC, ldC );
 
                 /* A2 = A2 - W * Wc */
                 cublasZgemm( CUBLAS_HANDLE
                              chameleon_cublas_const(ChamNoTrans), chameleon_cublas_const(ChamNoTrans),
                              M2, N2, K,
-                             CUBLAS_SADDR(mzone), WORK,  LDWORK,
-                                                  WORKC, LDWORKC,
+                             CUBLAS_SADDR(mzone), workW, ldW,
+                                                  workC, ldC,
                              CUBLAS_SADDR(zone),  A2,    LDA2 );
             }
         }
diff --git a/cudablas/compute/cuda_ztpmlqt.c b/cudablas/compute/cuda_ztpmlqt.c
new file mode 100644
index 0000000000000000000000000000000000000000..4f01e0e28cf34690cc3e3daa1823c418e9557fac
--- /dev/null
+++ b/cudablas/compute/cuda_ztpmlqt.c
@@ -0,0 +1,184 @@
+/**
+ *
+ * @file cuda_ztpmlqt.c
+ *
+ * @copyright 2009-2016 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_ztpmlqt GPU kernel
+ *
+ * @version 1.0.0
+ * @author Mathieu Faverge
+ * @date 2018-11-09
+ * @precisions normal z -> c d s
+ *
+ */
+#include "cudablas.h"
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * @brief Applies a complex orthogonal matrix Q.
+ *
+ * The matrix Q is obtained from a "triangular-pentagonal" complex block
+ * reflector H to a general complex matrix C, which consists of two blocks A and
+ * B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg ChamLeft  : apply Q or Q**H from the Left;
+ *         @arg ChamRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg ChamNoTrans   :  No transpose, apply Q;
+ *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B. N >= 0.
+ *
+ * @param[in] K
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] L
+ *          The order of the trapezoidal part V2 of V.
+ *          K >= L >= 0.  See Further Details.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] V
+ *         The i-th row must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTPLQT in the first k rows of its array argument V.
+ *
+ * @param[in] LDV
+ *         The leading dimension of the array V. LDV >= max(1,K).
+ *
+ * @param[in] T
+ *         The IB-by-N1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[in,out] A
+ *         A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
+ *         or (LDA,K) if SIDE = ChamRight
+ *         On entry, the K-by-N or M-by-K matrix A.
+ *         On exit, A is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A. LDA >= max(1,M).
+ *         If side = ChamLeft,  LDA >= max(1,K);
+ *         If side = ChamRight, LDA >= max(1,M).
+ *
+ * @param[in,out] B
+ *         On entry, the M-by-N tile B.
+ *         On exit, B is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDB
+ *         The leading dimension of the tile B. LDB >= max(1,M).
+ *
+ * @param[out] WORK
+ *         Workspace array of dimension lwork, forwarded together with lwork
+ *         to CUDA_ztsmlq (L == 0) or CUDA_zttmlq (L == N).
+ *
+ *******************************************************************************
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The rows of the pentagonal matrix V contain the elementary reflectors
+ *  H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
+ *  trapezoidal block V2:
+ *
+ *        V = [V1] [V2].
+ *
+ *  The size of the trapezoidal block V2 is determined by the parameter L,
+ *  where 0 <= L <= K; V2 is lower trapezoidal, consisting of the first L
+ *  rows of a K-by-K upper triangular matrix.  If L=K, V2 is lower triangular;
+ *  if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
+ *
+ *  If side = ChamLeft:  C = [A]  where A is K-by-N,  B is M-by-N and V is K-by-M.
+ *                            [B]
+ *
+ *  If side = ChamRight: C = [A B]  where A is M-by-K, B is M-by-N and V is K-by-N.
+ *
+ *  The complex orthogonal matrix Q is formed from V and T.
+ *
+ *  If trans='N' and side='L', C is on exit replaced with Q * C.
+ *
+ *  If trans='C' and side='L', C is on exit replaced with Q**H * C.
+ *
+ *  If trans='N' and side='R', C is on exit replaced with C * Q.
+ *
+ *  If trans='C' and side='R', C is on exit replaced with C * Q**H.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
+int
+CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans,
+              int M, int N, int K, int L, int IB,
+              const cuDoubleComplex *V, int LDV,
+              const cuDoubleComplex *T, int LDT,
+                    cuDoubleComplex *A, int LDA,
+                    cuDoubleComplex *B, int LDB,
+                    cuDoubleComplex *WORK, int lwork,
+              CUBLAS_STREAM_PARAM )
+{
+    int m1, n1, info;
+
+    /* Check input arguments */
+    if ((side != ChamLeft) && (side != ChamRight)) {
+        cudablas_error(1, "Illegal value of side");
+        return -1;
+    }
+
+    if ( side == ChamLeft ) {
+        m1 = K; /* A is K-by-N when Q is applied from the left */
+        n1 = N;
+    }
+    else {
+        m1 = M; /* A is M-by-K when Q is applied from the right */
+        n1 = K;
+    }
+
+    /* TS case (rectangular V): forward the kernel status to the caller */
+    if (L == 0) {
+        info = CUDA_ztsmlq( side, trans, m1, n1, M, N, K, IB,
+                            A, LDA, B, LDB, V, LDV, T, LDT,
+                            WORK, lwork,
+                            CUBLAS_STREAM_VALUE );
+    }
+    /* TT case: forward the kernel status instead of masking a failure */
+    else  if( L == N ) {
+        info = CUDA_zttmlq( side, trans, m1, n1, M, N, K, IB,
+                            A, LDA, B, LDB, V, LDV, T, LDT,
+                            WORK, lwork,
+                            CUBLAS_STREAM_VALUE );
+    }
+    else {
+        cudablas_error(-6, "TPMLQT not available on GPU for general cases yet\n" );
+        return -6;
+    }
+
+    return info;
+}
diff --git a/cudablas/compute/cuda_ztpmqrt.c b/cudablas/compute/cuda_ztpmqrt.c
index 2719edd32b1fef478dd150f67b89ba0c8dc465e2..c7a19fd76726491b982c0da9be7c0ff444fd829c 100644
--- a/cudablas/compute/cuda_ztpmqrt.c
+++ b/cudablas/compute/cuda_ztpmqrt.c
@@ -13,12 +13,128 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ * @brief Applies a complex orthogonal matrix Q.
+ *
+ * The matrix Q is obtained from a "triangular-pentagonal" complex block
+ * reflector H to a general complex matrix C, which consists of two blocks A and
+ * B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg ChamLeft  : apply Q or Q**H from the Left;
+ *         @arg ChamRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg ChamNoTrans   :  No transpose, apply Q;
+ *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B. N >= 0.
+ *
+ * @param[in] K
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ *          K >= L >= 0.  See Further Details.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] V
+ *         The i-th column must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ZTPQRT in the first k columns of its array argument V.
+ *
+ * @param[in] LDV
+ *         The leading dimension of the array V. LDV >= max(1,K).
+ *
+ * @param[in] T
+ *         The IB-by-N1 triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[in,out] A
+ *         A is COMPLEX*16 array, dimension (LDA,N) if side = ChamLeft
+ *         or (LDA,K) if SIDE = ChamRight
+ *         On entry, the K-by-N or M-by-K matrix A.
+ *         On exit, A is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A. LDA >= max(1,M).
+ *         If side = ChamLeft,  LDA >= max(1,K);
+ *         If side = ChamRight, LDA >= max(1,M).
+ *
+ * @param[in,out] B
+ *         On entry, the M-by-N tile B.
+ *         On exit, B is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDB
+ *         The leading dimension of the tile B. LDB >= max(1,M).
+ *
+ * @param[out] WORK
+ *         Workspace array of dimension lwork, forwarded together with lwork
+ *         to CUDA_ztsmqr (L == 0) or CUDA_zttmqr (L == M).
+ *
+ *******************************************************************************
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The columns of the pentagonal matrix V contain the elementary reflectors
+ *  H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
+ *  trapezoidal block V2:
+ *
+ *        V = [V1]
+ *            [V2].
+ *
+ *  The size of the trapezoidal block V2 is determined by the parameter L,
+ *  where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L
+ *  rows of a K-by-K upper triangular matrix.  If L=K, V2 is upper triangular;
+ *  if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
+ *
+ *  If side = ChamLeft:  C = [A]  where A is K-by-N,  B is M-by-N and V is M-by-K.
+ *                            [B]
+ *
+ *  If side = ChamRight: C = [A B]  where A is M-by-K, B is M-by-N and V is N-by-K.
+ *
+ *  The complex orthogonal matrix Q is formed from V and T.
+ *
+ *  If trans='N' and side='L', C is on exit replaced with Q * C.
+ *
+ *  If trans='C' and side='L', C is on exit replaced with Q**H * C.
+ *
+ *  If trans='N' and side='R', C is on exit replaced with C * Q.
+ *
+ *  If trans='C' and side='R', C is on exit replaced with C * Q**H.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if -i, the i-th argument had an illegal value
+ *
+ */
 int
 CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
               int M, int N, int K, int L, int IB,
@@ -26,10 +142,10 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
               const cuDoubleComplex *T, int LDT,
                     cuDoubleComplex *A, int LDA,
                     cuDoubleComplex *B, int LDB,
-                    cuDoubleComplex *WORK,
+                    cuDoubleComplex *WORK, int lwork,
               CUBLAS_STREAM_PARAM )
 {
-    int m1, n1, ldwork, ldworkc, ws;
+    int m1, n1;
 
     /* Check input arguments */
     if ((side != ChamLeft) && (side != ChamRight)) {
@@ -40,30 +156,24 @@ CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans,
     if ( side == ChamLeft ) {
         m1 = K;
         n1 = N;
-        ldwork  = IB;
-        ldworkc = M;
-        ws = ldwork * n1;
     }
     else {
         m1 = M;
         n1 = K;
-        ldwork = chameleon_max( K, chameleon_max( M, N ) );
-        ldworkc = IB;
-        ws = ldwork * IB;
     }
 
     /* TS case */
     if (L == 0) {
         CUDA_ztsmqr( side, trans, m1, n1, M, N, K, IB,
                      A, LDA, B, LDB, V, LDV, T, LDT,
-                     WORK, ldwork, WORK + ws, ldworkc,
+                     WORK, lwork,
                      CUBLAS_STREAM_VALUE );
     }
     /* TT case */
     else  if( L == M ) {
         CUDA_zttmqr( side, trans, m1, n1, M, N, K, IB,
                      A, LDA, B, LDB, V, LDV, T, LDT,
-                     WORK, ldwork, WORK + ws, ldworkc,
+                     WORK, lwork,
                      CUBLAS_STREAM_VALUE );
     }
     else {
diff --git a/cudablas/compute/cuda_ztrmm.c b/cudablas/compute/cuda_ztrmm.c
index 390311e0834fd5b126f912b033b2dc5748c6b2e9..b2809c779ce0e00003a3d5b22e26e1fefb63805f 100644
--- a/cudablas/compute/cuda_ztrmm.c
+++ b/cudablas/compute/cuda_ztrmm.c
@@ -20,13 +20,13 @@
 #include "cudablas.h"
 
 int CUDA_ztrmm(
-        cham_side_t side, cham_uplo_t uplo,
-        cham_trans_t transa, cham_diag_t diag,
-        int m, int n,
-        cuDoubleComplex *alpha,
-        const cuDoubleComplex *A, int lda,
-        cuDoubleComplex *B, int ldb,
-        CUBLAS_STREAM_PARAM)
+    cham_side_t side, cham_uplo_t uplo,
+    cham_trans_t transa, cham_diag_t diag,
+    int m, int n,
+    cuDoubleComplex *alpha,
+    const cuDoubleComplex *A, int lda,
+    cuDoubleComplex *B, int ldb,
+    CUBLAS_STREAM_PARAM)
 {
 
 #if defined(CHAMELEON_USE_CUBLAS_V2)
diff --git a/cudablas/compute/cuda_ztsmlq.c b/cudablas/compute/cuda_ztsmlq.c
index 7d2e3c4a3fbc19d5ec319991a9d56a537608016b..2dcfc24158a77a0427ed00bd294a72b55a97f238 100644
--- a/cudablas/compute/cuda_ztsmlq.c
+++ b/cudablas/compute/cuda_ztsmlq.c
@@ -13,27 +13,26 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @author Mathieu Faverge
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int CUDA_ztsmlq(
-        cham_side_t side, cham_trans_t trans,
-        int M1, int N1,
-        int M2, int N2,
-        int K, int IB,
-              cuDoubleComplex *A1,    int LDA1,
-              cuDoubleComplex *A2,    int LDA2,
-        const cuDoubleComplex *V,     int LDV,
-        const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
-        CUBLAS_STREAM_PARAM)
+    cham_side_t side, cham_trans_t trans,
+    int M1, int N1,
+    int M2, int N2,
+    int K, int IB,
+    cuDoubleComplex *A1,    int LDA1,
+    cuDoubleComplex *A2,    int LDA2,
+    const cuDoubleComplex *V,     int LDV,
+    const cuDoubleComplex *T,     int LDT,
+    cuDoubleComplex *WORK,  int LWORK,
+    CUBLAS_STREAM_PARAM)
 {
     int i, i1, i3;
-    int NW;
     int kb;
     int ic = 0;
     int jc = 0;
@@ -45,14 +44,6 @@ int CUDA_ztsmlq(
         return -1;
     }
 
-    /* NW is the minimum dimension of WORK */
-    if (side == ChamLeft) {
-        NW = IB;
-    }
-    else {
-        NW = N1;
-    }
-
     if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
         return -2;
     }
@@ -90,21 +81,20 @@ int CUDA_ztsmlq(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft) && (trans == ChamNoTrans))
-        || ((side == ChamRight) && (trans != ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans == ChamNoTrans))  ||
+         ((side == ChamRight) && (trans != ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
@@ -115,7 +105,7 @@ int CUDA_ztsmlq(
         trans = ChamNoTrans;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -137,13 +127,13 @@ int CUDA_ztsmlq(
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
         CUDA_zparfb(
-                side, trans, ChamDirForward, ChamRowwise,
-                mi, ni, M2, N2, kb, 0,
-                A1 + LDA1*jc+ic, LDA1,
-                A2, LDA2,
-                V + i, LDV,
-                T + LDT*i, LDT,
-                WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            side, trans, ChamDirForward, ChamRowwise,
+            mi, ni, M2, N2, kb, 0,
+            A1 + LDA1*jc+ic, LDA1,
+            A2, LDA2,
+            V + i, LDV,
+            T + LDT*i, LDT,
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_ztsmqr.c b/cudablas/compute/cuda_ztsmqr.c
index 4b07b9b9ce16397eabf8e358d81b7499eeab2c39..e731dfbf451bb06ab0d9f519292b529c04bfe00a 100644
--- a/cudablas/compute/cuda_ztsmqr.c
+++ b/cudablas/compute/cuda_ztsmqr.c
@@ -13,27 +13,27 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @author Mathieu Faverge
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
 #include "cudablas.h"
 
 int CUDA_ztsmqr(
-        cham_side_t side, cham_trans_t trans,
-        int M1, int N1,
-        int M2, int N2,
-        int K, int IB,
-              cuDoubleComplex *A1,    int LDA1,
-              cuDoubleComplex *A2,    int LDA2,
-        const cuDoubleComplex *V,     int LDV,
-        const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
-        CUBLAS_STREAM_PARAM)
+    cham_side_t side, cham_trans_t trans,
+    int M1, int N1,
+    int M2, int N2,
+    int K, int IB,
+    cuDoubleComplex *A1,    int LDA1,
+    cuDoubleComplex *A2,    int LDA2,
+    const cuDoubleComplex *V,     int LDV,
+    const cuDoubleComplex *T,     int LDT,
+    cuDoubleComplex *WORK,  int LWORK,
+    CUBLAS_STREAM_PARAM)
 {
     int i, i1, i3;
-    int NQ, NW;
+    int NQ;
     int kb;
     int ic = 0;
     int jc = 0;
@@ -48,11 +48,9 @@ int CUDA_ztsmqr(
     /* NQ is the order of Q */
     if (side == ChamLeft) {
         NQ = M2;
-        NW = IB;
     }
     else {
         NQ = N2;
-        NW = M1;
     }
 
     if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
@@ -92,25 +90,24 @@ int CUDA_ztsmqr(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft)  && (trans != ChamNoTrans))
-        || ((side == ChamRight) && (trans == ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans != ChamNoTrans))  ||
+         ((side == ChamRight) && (trans == ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -127,17 +124,18 @@ int CUDA_ztsmqr(
             ni = N1 - i;
             jc = i;
         }
+
         /*
          * Apply H or H' (NOTE: CORE_zparfb used to be CORE_ztsrfb)
          */
         CUDA_zparfb(
-                side, trans, ChamDirForward, ChamColumnwise,
-                mi, ni, M2, N2, kb, 0,
-                A1 + LDA1*jc+ic, LDA1,
-                A2, LDA2,
-                V + LDV*i, LDV,
-                T + LDT*i, LDT,
-                WORK, LDWORK, WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            side, trans, ChamDirForward, ChamColumnwise,
+            mi, ni, M2, N2, kb, 0,
+            A1 + LDA1*jc+ic, LDA1,
+            A2, LDA2,
+            V + LDV*i, LDV,
+            T + LDT*i, LDT,
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/compute/cuda_zttmlq.c b/cudablas/compute/cuda_zttmlq.c
new file mode 100644
index 0000000000000000000000000000000000000000..c194adfe7f7c76847a4b64feaa9b0d5b560eb78b
--- /dev/null
+++ b/cudablas/compute/cuda_zttmlq.c
@@ -0,0 +1,140 @@
+/**
+ *
+ * @file cuda_zttmlq.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_zttmlq GPU kernel
+ *
+ * @version 1.0.0
+ * @author Florent Pruvost
+ * @author Mathieu Faverge
+ * @date 2018-11-09
+ * @precisions normal z -> c d s
+ *
+ */
+#include "cudablas.h"
+
+int CUDA_zttmlq( /* returns CHAMELEON_SUCCESS, or -k when the k-th argument (1-based) fails validation */
+        cham_side_t side, cham_trans_t trans, /* side: ChamLeft or ChamRight; trans: ChamNoTrans or ChamConjTrans */
+        int M1, int N1, /* both must be >= 0 */
+        int M2, int N2, /* both must be >= 0; M2 == M1 is required for ChamRight, N2 == N1 for ChamLeft */
+        int K, int IB, /* 0 <= K <= M1 (ChamLeft) or N1 (ChamRight); IB >= 0 is the step of the panel loop */
+              cuDoubleComplex *A1,    int LDA1, /* LDA1 >= max(1,M1) */
+              cuDoubleComplex *A2,    int LDA2, /* LDA2 >= max(1,M2) */
+        const cuDoubleComplex *V,     int LDV, /* LDV >= max(1,K) */
+        const cuDoubleComplex *T,     int LDT, /* LDT >= max(1,IB) */
+              cuDoubleComplex *WORK,  int LWORK, /* forwarded as-is to CUDA_zparfb; LWORK is not validated here */
+        CUBLAS_STREAM_PARAM) /* forwarded to CUDA_zparfb through CUBLAS_STREAM_VALUE */
+{
+    int i, i1, i3; /* i: current panel offset; i1: first offset; i3: signed step (+IB or -IB) */
+    int kb, l; /* kb: width of the current panel; l: value passed as L to CUDA_zparfb */
+    int ic = 0; /* row offset into A1, only moved for ChamLeft */
+    int jc = 0; /* column offset into A1, only moved for ChamRight */
+    int mi1 = M1; /* M1 argument of CUDA_zparfb; replaced by kb for ChamLeft */
+    int mi2 = M2; /* M2 argument of CUDA_zparfb; clipped per panel for ChamLeft */
+    int ni1 = N1; /* N1 argument of CUDA_zparfb; replaced by kb for ChamRight */
+    int ni2 = N2; /* N2 argument of CUDA_zparfb; clipped per panel for ChamRight */
+
+    /* Check input arguments: each failure returns minus the 1-based position of the offending argument */
+    if ((side != ChamLeft) && (side != ChamRight)) {
+        return -1;
+    }
+
+    if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
+        return -2;
+    }
+    if (M1 < 0) {
+        return -3;
+    }
+    if (N1 < 0) {
+        return -4;
+    }
+    if ( (M2 < 0) ||
+         ( (M2 != M1) && (side == ChamRight) ) ){
+        return -5;
+    }
+    if ( (N2 < 0) ||
+         ( (N2 != N1) && (side == ChamLeft) ) ){
+        return -6;
+    }
+    if ((K < 0) ||
+        ( (side == ChamLeft)  && (K > M1) ) ||
+        ( (side == ChamRight) && (K > N1) ) ) {
+        return -7;
+    }
+    if (IB < 0) {
+        return -8;
+    }
+    if (LDA1 < chameleon_max(1,M1)){
+        return -10;
+    }
+    if (LDA2 < chameleon_max(1,M2)){
+        return -12;
+    }
+    if (LDV < chameleon_max(1,K)){
+        return -14;
+    }
+    if (LDT < chameleon_max(1,IB)){
+        return -16;
+    }
+
+    /* Quick return: nothing to do when any size is zero (this also keeps IB == 0 away from the division below) */
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    if ( ((side == ChamLeft ) && (trans == ChamNoTrans)) || /* loop direction is chosen from the caller's trans, before it is flipped below */
+         ((side == ChamRight) && (trans != ChamNoTrans)) )
+    {
+        i1 = 0; /* ascending: start at the first panel */
+        i3 = IB;
+    }
+    else {
+        i1 = ( ( K-1 ) / IB )*IB; /* descending: start at the offset of the last (possibly partial) panel */
+        i3 = -IB;
+    }
+
+    /* Flip trans (NoTrans <-> ConjTrans) before handing it to CUDA_zparfb */
+    if (trans == ChamNoTrans) {
+        trans = ChamConjTrans;
+    }
+    else {
+        trans = ChamNoTrans;
+    }
+
+    for (i = i1; (i > -1) && (i < K); i+=i3) { /* i > -1 ends the descending walk, i < K the ascending one */
+        kb = chameleon_min(IB, K-i); /* the last panel may be narrower than IB */
+
+        if (side == ChamLeft) {
+            mi1 = kb;
+            mi2 = chameleon_min(i+kb, M2); /* never exceeds M2 */
+            l   = chameleon_min(kb, chameleon_max(0, M2-i)); /* clamped to the range [0, kb] */
+            ic  = i;
+        }
+        else {
+            ni1 = kb;
+            ni2 = chameleon_min(i+kb, N2); /* never exceeds N2 */
+            l   = chameleon_min(kb, chameleon_max(0, N2-i)); /* clamped to the range [0, kb] */
+            jc  = i;
+        }
+
+        /*
+         * Apply H or H' (NOTE: CORE_zparfb used to be CORE_zttrfb)
+         */
+        CUDA_zparfb(
+            side, trans, ChamDirForward, ChamRowwise,
+            mi1, ni1, mi2, ni2, kb, l,
+            A1 + LDA1 * jc + ic, LDA1, /* A1 advanced by jc columns of stride LDA1 plus ic rows */
+            A2, LDA2,
+            V + i,       LDV, /* V advanced by i elements */
+            T + LDT * i, LDT, /* T advanced by i columns of stride LDT */
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
+    }
+    return CHAMELEON_SUCCESS;
+}
diff --git a/cudablas/compute/cuda_zttmqr.c b/cudablas/compute/cuda_zttmqr.c
index 236405cdf080fd3c7941aa62426a70cde2e5d8f7..8664d0675afdb82b60e92bf054caa45122c7ec89 100644
--- a/cudablas/compute/cuda_zttmqr.c
+++ b/cudablas/compute/cuda_zttmqr.c
@@ -6,6 +6,7 @@
  *                      Tennessee Research Foundation. All rights reserved.
  * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
  *                      Univ. Bordeaux. All rights reserved.
+ *
  ***
  *
  * @brief Chameleon cuda_zttmqr GPU kernel
@@ -13,7 +14,7 @@
  * @version 1.0.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -28,13 +29,12 @@ int CUDA_zttmqr(
               cuDoubleComplex *A2,    int LDA2,
         const cuDoubleComplex *V,     int LDV,
         const cuDoubleComplex *T,     int LDT,
-              cuDoubleComplex *WORK,  int LDWORK,
-              cuDoubleComplex *WORKC, int LDWORKC,
+              cuDoubleComplex *WORK,  int LWORK,
         CUBLAS_STREAM_PARAM)
 {
-    int i, i1, i3, l;
-    int NQ, NW;
-    int kb;
+    int i, i1, i3;
+    int NQ;
+    int kb, l;
     int ic = 0;
     int jc = 0;
     int mi1 = M1;
@@ -50,11 +50,9 @@ int CUDA_zttmqr(
     /* NQ is the order of Q */
     if (side == ChamLeft) {
         NQ = M2;
-        NW = IB;
     }
     else {
         NQ = N2;
-        NW = M1;
     }
 
     if ((trans != ChamNoTrans) && (trans != ChamConjTrans)) {
@@ -94,25 +92,24 @@ int CUDA_zttmqr(
     if (LDT < chameleon_max(1,IB)){
         return -16;
     }
-    if (LDWORK < chameleon_max(1,NW)){
-        return -18;
-    }
 
     /* Quick return */
-    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0))
+    if ((M1 == 0) || (N1 == 0) || (M2 == 0) || (N2 == 0) || (K == 0) || (IB == 0)) {
         return CHAMELEON_SUCCESS;
+    }
 
-    if (((side == ChamLeft)  && (trans != ChamNoTrans))
-        || ((side == ChamRight) && (trans == ChamNoTrans))) {
+    if ( ((side == ChamLeft ) && (trans != ChamNoTrans)) ||
+         ((side == ChamRight) && (trans == ChamNoTrans)) )
+    {
         i1 = 0;
         i3 = IB;
     }
     else {
-        i1 = ((K-1) / IB)*IB;
+        i1 = ( ( K-1 ) / IB )*IB;
         i3 = -IB;
     }
 
-    for(i = i1; (i > -1) && (i < K); i += i3) {
+    for (i = i1; (i > -1) && (i < K); i+=i3) {
         kb = chameleon_min(IB, K-i);
 
         if (side == ChamLeft) {
@@ -138,8 +135,7 @@ int CUDA_zttmqr(
             A2, LDA2,
             V + LDV*i, LDV,
             T + LDT*i, LDT,
-            WORK, LDWORK,
-            WORKC, LDWORKC, CUBLAS_STREAM_VALUE );
+            WORK, LWORK, CUBLAS_STREAM_VALUE );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/cudablas/eztrace_module/cudablas_eztrace_module b/cudablas/eztrace_module/cudablas_eztrace_module
index 7ec370d3f3ad568264dc17c045c1c1f90de71eec..c631193b1fcedd7597a645dad9d208b29c1faed8 100644
--- a/cudablas/eztrace_module/cudablas_eztrace_module
+++ b/cudablas/eztrace_module/cudablas_eztrace_module
@@ -273,6 +273,30 @@ int CUDA_ctsmqr(
             void *WORK,  void* LDWORK,
             void *WORKC, void* LDWORKC,
         void* stream);
+int CUDA_cttmlq(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        void *A1, void* LDA1,
+        void *A2, void* LDA2,
+        void *V, void* LDV,
+        void *T, void* LDT,
+        void *WORK,
+        void* LWORK,
+        void* stream);
+int CUDA_cttmqr(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        void *A1, void* LDA1,
+        void *A2, void* LDA2,
+        void *V, void* LDV,
+        void *T, void* LDT,
+        void *WORK,
+        void* LWORK,
+        void* stream);
 int CUDA_ctsqrt(
         void* m, void* n, void* nb,
         void *da1, void* ldda1,
@@ -528,6 +552,30 @@ int CUDA_dtsmqr(
             double *WORK,  void* LDWORK,
             double *WORKC, void* LDWORKC,
         void* stream);
+int CUDA_dttmlq(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        double *A1, void* LDA1,
+        double *A2, void* LDA2,
+        const double *V, void* LDV,
+        const double *T, void* LDT,
+        double *WORK,
+        void* LWORK,
+        void* stream);
+int CUDA_dttmqr(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        double *A1, void* LDA1,
+        double *A2, void* LDA2,
+        const double *V, void* LDV,
+        const double *T, void* LDT,
+        double *WORK,
+        void* LWORK,
+        void* stream);
 int CUDA_dtsqrt(
         void* m, void* n, void* nb,
         double *da1, void* ldda1,
@@ -783,6 +831,30 @@ int CUDA_stsmqr(
             float *WORK,  void* LDWORK,
             float *WORKC, void* LDWORKC,
         void* stream);
+int CUDA_sttmlq(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        float *A1, void* LDA1,
+        float *A2, void* LDA2,
+        const float *V, void* LDV,
+        const float *T, void* LDT,
+        float *WORK,
+        void* LWORK,
+        void* stream);
+int CUDA_sttmqr(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        float *A1, void* LDA1,
+        float *A2, void* LDA2,
+        const float *V, void* LDV,
+        const float *T, void* LDT,
+        float *WORK,
+        void* LWORK,
+        void* stream);
 int CUDA_stsqrt(
         void* m, void* n, void* nb,
         float *da1, void* ldda1,
@@ -1090,6 +1162,30 @@ int CUDA_ztsmqr(
             void *WORK,  void* LDWORK,
             void *WORKC, void* LDWORKC,
         void* stream);
+int CUDA_zttmlq(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        void *A1, void* LDA1,
+        void *A2, void* LDA2,
+        void *V, void* LDV,
+        void *T, void* LDT,
+        void *WORK,
+        void* LWORK,
+        void* stream);
+int CUDA_zttmqr(
+        void* side, void* trans,
+        void* M1, void* N1,
+        void* M2, void* N2,
+        void* K, void* IB,
+        void *A1, void* LDA1,
+        void *A2, void* LDA2,
+        void *V, void* LDV,
+        void *T, void* LDT,
+        void *WORK,
+        void* LWORK,
+        void* stream);
 int CUDA_ztsqrt(
         void* m, void* n, void* nb,
         void *da1, void* ldda1,
diff --git a/cudablas/include/cudablas/cudablas_z.h b/cudablas/include/cudablas/cudablas_z.h
index 8e96d463ca451c74b17f91c22c2b6c03a27a101e..8895ff6485b33c423bd80b5b99599883eec58ce7 100644
--- a/cudablas/include/cudablas/cudablas_z.h
+++ b/cudablas/include/cudablas/cudablas_z.h
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Florent Pruvost
- * @date 2015-09-16
+ * @date 2018-11-09
  * @precisions normal z -> c d s
  *
  */
@@ -31,16 +31,18 @@ int CUDA_zher2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleCom
 int CUDA_zherfb( cham_uplo_t uplo, int n, int k, int ib, int nb, const cuDoubleComplex *A, int lda, const cuDoubleComplex *T, int ldt, cuDoubleComplex *C, int ldc, cuDoubleComplex *WORK, int ldwork, CUBLAS_STREAM_PARAM );
 int CUDA_zherk(  cham_uplo_t uplo, cham_trans_t trans, int n, int k, double *alpha, const cuDoubleComplex *A, int lda, double *beta, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
 int CUDA_zlarfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M, int N, int K, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *C, int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
-int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_zparfb(cham_side_t side, cham_trans_t trans, cham_dir_t direct, cham_store_t storev, int M1, int N1, int M2, int N2, int K, int L, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zsymm(  cham_side_t side, cham_uplo_t uplo, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_zsyr2k( cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
 int CUDA_zsyrk(  cham_uplo_t uplo, cham_trans_t trans, int n, int k, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, CUBLAS_STREAM_PARAM );
-int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, CUBLAS_STREAM_PARAM );
+int CUDA_ztpmqrt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
+int CUDA_ztpmlqt( cham_side_t side, cham_trans_t trans, int M, int N, int K, int L, int IB, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cuDoubleComplex *WORK, int lwork, CUBLAS_STREAM_PARAM );
 int CUDA_ztrmm(  cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
 int CUDA_ztrsm(  cham_side_t side, cham_uplo_t uplo, cham_trans_t transa, cham_diag_t diag, int m, int n, cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, cuDoubleComplex *B, int ldb, CUBLAS_STREAM_PARAM );
-int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
-int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
-int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LDWORK, cuDoubleComplex *WORKC, int LDWORKC, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zttmlq( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_ztsmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
+int CUDA_zttmqr( cham_side_t side, cham_trans_t trans, int M1, int N1, int M2, int N2, int K, int IB, cuDoubleComplex *A1, int LDA1, cuDoubleComplex *A2, int LDA2, const cuDoubleComplex *V, int LDV, const cuDoubleComplex *T, int LDT, cuDoubleComplex *WORK, int LWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmlqt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 int CUDA_zunmqrt(cham_side_t side, cham_trans_t trans, int M, int N, int K, int IB, const cuDoubleComplex *A,    int LDA, const cuDoubleComplex *T,    int LDT, cuDoubleComplex *C,    int LDC, cuDoubleComplex *WORK, int LDWORK, CUBLAS_STREAM_PARAM );
 
diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index bb1794316b74504f8629d2fb3e9a40ac50d18879..b8f91fb211f98a417a9c722794df6c45c441dcdc 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Cedric Augonnet
- * @date 2011-06-01
+ * @date 2018-11-08
  *
  */
 #ifndef _chameleon_tasks_h_
@@ -54,16 +54,12 @@ typedef enum chameleon_tasktype_e {
   TASK_ORMQR,
   TASK_POTRF,
   TASK_SSSSM,
+  TASK_TPLQT,
+  TASK_TPMLQT,
+  TASK_TPMQRT,
+  TASK_TPQRT,
   TASK_TRTRI,
-  TASK_TSLQT,
-  TASK_TSMLQ,
-  TASK_TSMQR,
-  TASK_TSQRT,
   TASK_TSTRF,
-  TASK_TTLQT,
-  TASK_TTMLQ,
-  TASK_TTMQR,
-  TASK_TTQRT,
   TASK_UNMLQ,
   TASK_UNMQR,
 
@@ -86,6 +82,15 @@ typedef enum chameleon_tasktype_e {
   TASK_NBKERNELS
 } cham_tasktype_t;
 
+#define TASK_TSLQT TASK_TPLQT /* TS* ids no longer exist in cham_tasktype_t; each name is kept as an alias of a TP* id */
+#define TASK_TSMLQ TASK_TPMLQT
+#define TASK_TSMQR TASK_TPMQRT
+#define TASK_TSQRT TASK_TPQRT
+#define TASK_TTLQT TASK_TPLQT /* TT* names map onto the same TP* ids, so TS and TT tasks share one id per kernel */
+#define TASK_TTMLQ TASK_TPMLQT
+#define TASK_TTMQR TASK_TPMQRT
+#define TASK_TTQRT TASK_TPQRT
+
 typedef int (*cham_unary_operator_t)( const CHAM_desc_t *desc,
                                       cham_uplo_t uplo, int m, int n,
                                       void *data, void *op_args );
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 28aeaa1b889e0eb5fe16618b02d9bebd73de905a..8265e990686f22639acc23d0919348d017947cb2 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -20,7 +20,7 @@
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-07
  * @precisions normal z -> c d s
  *
  */
@@ -31,435 +31,499 @@
  *  Declarations of QUARK wrappers (called by CHAMELEON) - alphabetical order
  */
 void INSERT_TASK_dzasum( const RUNTIME_option_t *options,
-                        cham_store_t storev, cham_uplo_t uplo, int M, int N,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_store_t storev, cham_uplo_t uplo, int M, int N,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
-                        cham_trans_t trans, int m, int n, int nb,
-                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         cham_trans_t trans, int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo,
-                         int m, int n, int nb,
-                         CHAMELEON_Complex64_t alpha,
-                         const CHAM_desc_t *A, int Am, int An, int lda );
+                          cham_uplo_t uplo,
+                          int m, int n, int nb,
+                          CHAMELEON_Complex64_t alpha,
+                          const CHAM_desc_t *A, int Am, int An, int lda );
 void INSERT_TASK_zbrdalg( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo,
-                         int N, int NB,
-                         const CHAM_desc_t *A,
-                         const CHAM_desc_t *C, int Cm, int Cn,
-                         const CHAM_desc_t *S, int Sm, int Sn,
-                         int i, int j, int m, int grsiz, int BAND,
-                         int *PCOL, int *ACOL, int *MCOL );
+                          cham_uplo_t uplo,
+                          int N, int NB,
+                          const CHAM_desc_t *A,
+                          const CHAM_desc_t *C, int Cm, int Cn,
+                          const CHAM_desc_t *S, int Sm, int Sn,
+                          int i, int j, int m, int grsiz, int BAND,
+                          int *PCOL, int *ACOL, int *MCOL );
 void INSERT_TASK_zgelqt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                         int m, int n, int ib, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt );
 void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
-                       cham_trans_t transA, cham_trans_t transB,
-                       int m, int n, int k, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
-void INSERT_TASK_zgemm2( const RUNTIME_option_t *options,
                         cham_trans_t transA, cham_trans_t transB,
                         int m, int n, int k, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *B, int Bm, int Bn, int ldb,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+void INSERT_TASK_zgemm2( const RUNTIME_option_t *options,
+                         cham_trans_t transA, cham_trans_t transB,
+                         int m, int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zgemm_f2( const RUNTIME_option_t *options,
-                          cham_trans_t transA, cham_trans_t transB,
-                          int m, int n, int k, int nb,
-                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                          const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                          CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc,
-                          const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
-                          const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
+                           cham_trans_t transA, cham_trans_t transB,
+                           int m, int n, int k, int nb,
+                           CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                           const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                           CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc,
+                           const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
+                           const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
 void INSERT_TASK_zgemm_p2( const RUNTIME_option_t *options,
-                          cham_trans_t transA, cham_trans_t transB,
-                          int m, int n, int k, int nb,
-                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                          const CHAMELEON_Complex64_t **B, int ldb,
-                          CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                           cham_trans_t transA, cham_trans_t transB,
+                           int m, int n, int k, int nb,
+                           CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                           const CHAMELEON_Complex64_t **B, int ldb,
+                           CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zgemm_p2f1( const RUNTIME_option_t *options,
-                            cham_trans_t transA, cham_trans_t transB,
-                            int m, int n, int k, int nb,
-                            CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                            const CHAMELEON_Complex64_t **B, int ldb,
-                            CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc,
-                            const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1 );
+                             cham_trans_t transA, cham_trans_t transB,
+                             int m, int n, int k, int nb,
+                             CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                             const CHAMELEON_Complex64_t **B, int ldb,
+                             CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc,
+                             const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1 );
 void INSERT_TASK_zgemm_p3( const RUNTIME_option_t *options,
-                          cham_trans_t transA, cham_trans_t transB,
-                          int m, int n, int k, int nb,
-                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                          const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                          CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t **C, int ldc );
+                           cham_trans_t transA, cham_trans_t transB,
+                           int m, int n, int k, int nb,
+                           CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                           const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                           CHAMELEON_Complex64_t beta, CHAMELEON_Complex64_t **C, int ldc );
 void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                         int m, int n, int ib, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt );
 void INSERT_TASK_zgessm( const RUNTIME_option_t *options,
-                        int m, int n, int k, int ib, int nb,
-                        int *IPIV,
-                        const CHAM_desc_t *L, int Lm, int Ln, int ldl,
-                        const CHAM_desc_t *D, int Dm, int Dn, int ldd,
-                        const CHAM_desc_t *A, int Am, int An, int lda );
+                         int m, int n, int k, int ib, int nb,
+                         int *IPIV,
+                         const CHAM_desc_t *L, int Lm, int Ln, int ldl,
+                         const CHAM_desc_t *D, int Dm, int Dn, int ldd,
+                         const CHAM_desc_t *A, int Am, int An, int lda );
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                        int m, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
+                         int m, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zgetrf( const RUNTIME_option_t *options,
-                        int m, int n, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        int *IPIV,
-                        cham_bool_t check_info, int iinfo );
+                         int m, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         int *IPIV,
+                         cham_bool_t check_info, int iinfo );
 void INSERT_TASK_zgetrf_incpiv( const RUNTIME_option_t *options,
-                               int m, int n, int ib, int nb,
-                               const CHAM_desc_t *A, int Am, int An, int lda,
-                               const CHAM_desc_t *L, int Lm, int Ln, int ldl,
-                               int *IPIV,
-                               cham_bool_t check_info, int iinfo );
+                                int m, int n, int ib, int nb,
+                                const CHAM_desc_t *A, int Am, int An, int lda,
+                                const CHAM_desc_t *L, int Lm, int Ln, int ldl,
+                                int *IPIV,
+                                cham_bool_t check_info, int iinfo );
 void INSERT_TASK_zgetrf_nopiv( const RUNTIME_option_t *options,
-                              int m, int n, int ib, int nb,
-                              const CHAM_desc_t *A, int Am, int An, int lda, int iinfo );
+                               int m, int n, int ib, int nb,
+                               const CHAM_desc_t *A, int Am, int An, int lda, int iinfo );
 void INSERT_TASK_zgetrf_reclap( const RUNTIME_option_t *options,
-                               int m, int n, int nb,
-                               const CHAM_desc_t *A, int Am, int An, int lda,
-                               int *IPIV,
+                                int m, int n, int nb,
+                                const CHAM_desc_t *A, int Am, int An, int lda,
+                                int *IPIV,
 
-                               cham_bool_t check_info, int iinfo,
-                               int nbthread );
+                                cham_bool_t check_info, int iinfo,
+                                int nbthread );
 void INSERT_TASK_zgetrf_rectil( const RUNTIME_option_t *options,
-                               const CHAM_desc_t A, const CHAM_desc_t *Amn, int Amnm, int Amnn, int size,
-                               int *IPIV,
+                                const CHAM_desc_t A, const CHAM_desc_t *Amn, int Amnm, int Amnn, int size,
+                                int *IPIV,
 
-                               cham_bool_t check_info, int iinfo,
-                               int nbthread );
+                                cham_bool_t check_info, int iinfo,
+                                int nbthread );
 void INSERT_TASK_zgetrip( const RUNTIME_option_t *options,
-                         int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA );
+                          int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA );
 void INSERT_TASK_zgetrip_f1( const RUNTIME_option_t *options,
-                            int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA,
-                            const CHAM_desc_t *fake, int fakem, int faken, int szeF, int paramF );
+                             int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA,
+                             const CHAM_desc_t *fake, int fakem, int faken, int szeF, int paramF );
 void INSERT_TASK_zgetrip_f2( const RUNTIME_option_t *options,
-                            int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA,
-                            const CHAM_desc_t *fake1, int fake1m, int fake1n, int szeF1, int paramF1,
-                            const CHAM_desc_t *fake2, int fake2m, int fake2n, int szeF2, int paramF2 );
+                             int m, int n, const CHAM_desc_t *A, int Am, int An, int szeA,
+                             const CHAM_desc_t *fake1, int fake1m, int fake1n, int szeF1, int paramF1,
+                             const CHAM_desc_t *fake2, int fake2m, int fake2n, int szeF2, int paramF2 );
 void INSERT_TASK_zhe2ge( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo,
-                        int m, int n, int mb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         cham_uplo_t uplo,
+                         int m, int n, int mb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
-                       cham_side_t side, cham_uplo_t uplo,
-                       int m, int n, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                        const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zhegst( const RUNTIME_option_t *options,
-                        int itype, cham_uplo_t uplo, int N,
-                        const CHAM_desc_t *A, int Am, int An, int LDA,
-                        const CHAM_desc_t *B, int Bm, int Bn, int LDB,
-                        int iinfo );
+                         int itype, cham_uplo_t uplo, int N,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn, int LDB,
+                         int iinfo );
 void INSERT_TASK_zherk( const RUNTIME_option_t *options,
-                       cham_uplo_t uplo, cham_trans_t trans,
-                       int n, int k, int nb,
-                       double alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
-void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
-                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int LDB,
+                        double alpha, const CHAM_desc_t *A, int Am, int An, int lda,
                         double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int LDB,
+                         double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zherfb( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo,
-                        int n, int k, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt,
-                        const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                         cham_uplo_t uplo,
+                         int n, int k, int ib, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                         const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int m, int n, int mb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int ldb );
-void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int m, int n, int mb,
-                         int displA, const CHAM_desc_t *A, int Am, int An, int lda,
-                         int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
+                          cham_uplo_t uplo, int m, int n, int mb,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_zlange( const RUNTIME_option_t *options,
-                        cham_normtype_t norm, int M, int N, int NB,
-                        const CHAM_desc_t *A, int Am, int An, int LDA,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_normtype_t norm, int M, int N, int NB,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zlange_max( const RUNTIME_option_t *options,
-                            const CHAM_desc_t *A, int Am, int An,
-                            const CHAM_desc_t *B, int Bm, int Bn );
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
+                         cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zlanhe( const RUNTIME_option_t *options,
-                        cham_normtype_t norm, cham_uplo_t uplo, int N, int NB,
-                        const CHAM_desc_t *A, int Am, int An, int LDA,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_normtype_t norm, cham_uplo_t uplo, int N, int NB,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zlansy( const RUNTIME_option_t *options,
-                        cham_normtype_t norm, cham_uplo_t uplo, int N, int NB,
-                        const CHAM_desc_t *A, int Am, int An, int LDA,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_normtype_t norm, cham_uplo_t uplo, int N, int NB,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zlantr( const RUNTIME_option_t *options,
-                        cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag,
-                        int M, int N, int NB,
-                        const CHAM_desc_t *A, int Am, int An, int LDA,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_normtype_t norm, cham_uplo_t uplo, cham_diag_t diag,
+                         int M, int N, int NB,
+                         const CHAM_desc_t *A, int Am, int An, int LDA,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
-                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea );
-void INSERT_TASK_zlaset2( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
-                         const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea );
+                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea );
+void INSERT_TASK_zlaset2( const RUNTIME_option_t *options,
+                          cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
+                          const CHAM_desc_t *tileA, int tileAm, int tileAn, int ldtilea );
 void INSERT_TASK_zlaswp( const RUNTIME_option_t *options,
-                        int n, const CHAM_desc_t *A, int Am, int An, int lda,
-                        int i1,  int i2, int *ipiv, int inc );
+                         int n, const CHAM_desc_t *A, int Am, int An, int lda,
+                         int i1,  int i2, int *ipiv, int inc );
 void INSERT_TASK_zlaswp_f2( const RUNTIME_option_t *options,
-                           int n, const CHAM_desc_t *A, int Am, int An, int lda,
-                           int i1,  int i2, int *ipiv, int inc,
-                           const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
-                           const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
+                            int n, const CHAM_desc_t *A, int Am, int An, int lda,
+                            int i1,  int i2, int *ipiv, int inc,
+                            const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
+                            const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
 void INSERT_TASK_zlaswp_ontile( const RUNTIME_option_t *options,
-                               const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An,
-                               int i1,  int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel );
-void INSERT_TASK_zlaswp_ontile_f2( const RUNTIME_option_t *options,
-                                  const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An,
-                                  int i1,  int i2, int *ipiv, int inc,
-                                  const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
-                                  const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
-void INSERT_TASK_zlaswpc_ontile( const RUNTIME_option_t *options,
                                 const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An,
                                 int i1,  int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel );
+void INSERT_TASK_zlaswp_ontile_f2( const RUNTIME_option_t *options,
+                                   const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An,
+                                   int i1,  int i2, int *ipiv, int inc,
+                                   const CHAM_desc_t *fake1, int fake1m, int fake1n, int szefake1, int flag1,
+                                   const CHAM_desc_t *fake2, int fake2m, int fake2n, int szefake2, int flag2 );
+void INSERT_TASK_zlaswpc_ontile( const RUNTIME_option_t *options,
+                                 const CHAM_desc_t descA, const CHAM_desc_t *A, int Am, int An,
+                                 int i1,  int i2, int *ipiv, int inc, CHAMELEON_Complex64_t *fakepanel );
 void INSERT_TASK_zlatro( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_zlauum( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda );
+                         cham_uplo_t uplo, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda );
 void INSERT_TASK_zplghe( const RUNTIME_option_t *options,
-                        double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
-                        int bigM, int m0, int n0, unsigned long long int seed );
+                         double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
+                         int bigM, int m0, int n0, unsigned long long int seed );
 void INSERT_TASK_zplgsy( const RUNTIME_option_t *options,
-                        CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
-                        int bigM, int m0, int n0, unsigned long long int seed );
+                         CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
+                         int bigM, int m0, int n0, unsigned long long int seed );
 void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
-                        int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
-                        int bigM, int m0, int n0, unsigned long long int seed );
+                         int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
+                         int bigM, int m0, int n0, unsigned long long int seed );
 void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
+                         cham_uplo_t uplo, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
 
-                        int iinfo );
+                         int iinfo );
 void INSERT_TASK_zshift( const RUNTIME_option_t *options,
-                        int s, int m, int n, int L,
-                        CHAMELEON_Complex64_t *A );
+                         int s, int m, int n, int L,
+                         CHAMELEON_Complex64_t *A );
 void INSERT_TASK_zshiftw( const RUNTIME_option_t *options,
-                         int s, int cl, int m, int n, int L,
-                         const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t *W );
+                          int s, int cl, int m, int n, int L,
+                          const CHAM_desc_t *A, int Am, int An, CHAMELEON_Complex64_t *W );
 void INSERT_TASK_zssssm( const RUNTIME_option_t *options,
-                        int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *L1, int L1m, int L1n, int ldl1,
-                        const CHAM_desc_t *L2, int L2m, int L2n, int ldl2,
-                        const int *IPIV );
+                         int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                         const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                         const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                         const CHAM_desc_t *L1, int L1m, int L1n, int ldl1,
+                         const CHAM_desc_t *L2, int L2m, int L2n, int ldl2,
+                         const int *IPIV );
 void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
-                       cham_side_t side, cham_uplo_t uplo,
-                       int m, int n, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                        const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                        CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
-                       cham_uplo_t uplo, cham_trans_t trans,
-                       int n, int k, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
-void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int LDB,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int LDB,
+                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
+                         cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options,
-                              cham_uplo_t uplo, int n, int nb,
-                              const CHAM_desc_t *A, int Am, int An, int lda,
-                              int iinfo );
+                               cham_uplo_t uplo, int n, int nb,
+                               const CHAM_desc_t *A, int Am, int An, int lda,
+                               int iinfo );
 void INSERT_TASK_zswpab( const RUNTIME_option_t *options,
-                        int i, int n1, int n2,
-                        const CHAM_desc_t *A, int Am, int An, int szeA );
+                         int i, int n1, int n2,
+                         const CHAM_desc_t *A, int Am, int An, int szeA );
 void INSERT_TASK_zswptr_ontile( const RUNTIME_option_t *options,
-                               const CHAM_desc_t descA, const CHAM_desc_t *Aij, int Aijm, int Aijn,
-                               int i1,  int i2, int *ipiv, int inc,
-                               const CHAM_desc_t *Akk, int Akkm, int Akkn, int ldak );
+                                const CHAM_desc_t descA, const CHAM_desc_t *Aij, int Aijm, int Aijn,
+                                int i1,  int i2, int *ipiv, int inc,
+                                const CHAM_desc_t *Akk, int Akkm, int Akkn, int ldak );
 void INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
-                        int m, int n, int l, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
-                         cham_side_t side, cham_trans_t trans,
-                         int M, int N, int K, int L, int ib, int nb,
-                         const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                         const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                         int m, int n, int l, int ib, int nb,
                          const CHAM_desc_t *A, int Am, int An, int lda,
-                         const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
+                          cham_side_t side, cham_trans_t trans,
+                          int M, int N, int K, int L, int ib, int nb,
+                          const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                          const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                          const CHAM_desc_t *A, int Am, int An, int lda,
+                          const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
-                         cham_side_t side, cham_trans_t trans,
-                         int m, int n, int k, int l, int ib, int nb,
-                         const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                         const CHAM_desc_t *T, int Tm, int Tn, int ldt,
-                         const CHAM_desc_t *A, int Am, int An, int lda,
-                         const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                          cham_side_t side, cham_trans_t trans,
+                          int m, int n, int k, int l, int ib, int nb,
+                          const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                          const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                          const CHAM_desc_t *A, int Am, int An, int lda,
+                          const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
-                        int m, int n, int l, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn, int ldb,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                         int m, int n, int l, int ib, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn, int ldb,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt );
 void INSERT_TASK_ztrdalg( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo,
-                         int N, int NB,
-                         const CHAM_desc_t *A,
-                         const CHAM_desc_t *C, int Cm, int Cn,
-                         const CHAM_desc_t *S, int Sm, int Sn,
-                         int i, int j, int m, int grsiz, int BAND,
-                         int *PCOL, int *ACOL, int *MCOL );
+                          cham_uplo_t uplo,
+                          int N, int NB,
+                          const CHAM_desc_t *A,
+                          const CHAM_desc_t *C, int Cm, int Cn,
+                          const CHAM_desc_t *S, int Sm, int Sn,
+                          int i, int j, int m, int grsiz, int BAND,
+                          int *PCOL, int *ACOL, int *MCOL );
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
-                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                         cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_ztrasm( const RUNTIME_option_t *options,
-                        cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *B, int Bm, int Bn );
+                         cham_store_t storev, cham_uplo_t uplo, cham_diag_t diag, int M, int N,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
-                       cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
-                       int m, int n, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                        cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                        const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_ztrmm_p2( const RUNTIME_option_t *options,
-                          cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
-                          int m, int n, int nb,
-                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                          CHAMELEON_Complex64_t **B, int ldb );
+                           cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                           int m, int n, int nb,
+                           CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                           CHAMELEON_Complex64_t **B, int ldb );
 void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
-                       cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
-                       int m, int n, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb );
+                        cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
+                        const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_ztrssq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, cham_diag_t diag,
-                        int m, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
+                         cham_uplo_t uplo, cham_diag_t diag,
+                         int m, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_ztrtri( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, cham_diag_t diag, int n, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
+                         cham_uplo_t uplo, cham_diag_t diag, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
 
-                        int iinfo );
-void INSERT_TASK_ztslqt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_ztsmlq( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                         int iinfo );
 void INSERT_TASK_ztsmlq_hetra1( const RUNTIME_option_t *options,
-                               cham_side_t side, cham_trans_t trans,
-                               int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                               const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                               const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                               const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                               const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_ztsmqr( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                                cham_side_t side, cham_trans_t trans,
+                                int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                                const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                                const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                                const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                                const CHAM_desc_t *T, int Tm, int Tn, int ldt );
 void INSERT_TASK_ztsmqr_hetra1( const RUNTIME_option_t *options,
-                               cham_side_t side, cham_trans_t trans,
-                               int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                               const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                               const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                               const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                               const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_ztsqrt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
+                                cham_side_t side, cham_trans_t trans,
+                                int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                                const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                                const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                                const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                                const CHAM_desc_t *T, int Tm, int Tn, int ldt );
 void INSERT_TASK_ztstrf( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *U, int Um, int Un, int ldu,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *L, int Lm, int Ln, int ldl,
-                        int *IPIV,
-                        cham_bool_t check_info, int iinfo );
-void INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_zttqrt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_zttmlq( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                         int m, int n, int ib, int nb,
+                         const CHAM_desc_t *U, int Um, int Un, int ldu,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *L, int Lm, int Ln, int ldl,
+                         int *IPIV,
+                         cham_bool_t check_info, int iinfo );
+void INSERT_TASK_zpamm( const RUNTIME_option_t *options,
+                        int op, cham_side_t side, cham_store_t storev,
+                        int m, int n, int k, int l,
                         const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
                         const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
                         const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_zttlqt( const RUNTIME_option_t *options,
-                        int m, int n, int ib, int nb,
-                        const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                        const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt );
-void INSERT_TASK_zpamm( const RUNTIME_option_t *options,
-                       int op, cham_side_t side, cham_store_t storev,
-                       int m, int n, int k, int l,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *W, int Wm, int Wn, int ldw );
+                        const CHAM_desc_t *W, int Wm, int Wn, int ldw );
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
-                        const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn );
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
+                         const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn );
 void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
-                         const CHAM_desc_t *RESULT, int RESULTm, int RESULTn );
+                          const CHAM_desc_t *RESULT, int RESULTm, int RESULTn );
 void INSERT_TASK_zunmlq( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m, int n, int ib,  int nb, int k,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt,
-                        const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                         cham_side_t side, cham_trans_t trans,
+                         int m, int n, int ib,  int nb, int k,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                         const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zunmqr( const RUNTIME_option_t *options,
-                        cham_side_t side, cham_trans_t trans,
-                        int m, int n, int k, int ib, int nb,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *T, int Tm, int Tn, int ldt,
-                        const CHAM_desc_t *C, int Cm, int Cn, int ldc );
+                         cham_side_t side, cham_trans_t trans,
+                         int m, int n, int k, int ib, int nb,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                         const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zbuild( const RUNTIME_option_t *options,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        void *user_data, void* user_build_callback );
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         void *user_data, void* user_build_callback );
+
+
+/**
+ * Keep these insert_task functions for backward compatibility
+ */
+static inline void
+INSERT_TASK_ztslqt( const RUNTIME_option_t *options,
+                    int m, int n, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztplqt with l = 0. */
+    INSERT_TASK_ztplqt( options, m, n, 0, ib, nb,
+                        A1, A1m, A1n, lda1,
+                        A2, A2m, A2n, lda2,
+                        T,  Tm,  Tn,  ldt );
+}
+
+static inline void
+INSERT_TASK_ztsqrt( const RUNTIME_option_t *options,
+                    int m, int n, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpqrt with l = 0. */
+    INSERT_TASK_ztpqrt( options, m, n, 0, ib, nb,
+                        A1, A1m, A1n, lda1,
+                        A2, A2m, A2n, lda2,
+                        T,  Tm,  Tn,  ldt );
+}
+
+static inline void
+INSERT_TASK_zttlqt( const RUNTIME_option_t *options,
+                    int m, int n, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztplqt with l = n. */
+    INSERT_TASK_ztplqt( options, m, n, n, ib, nb,
+                        A1, A1m, A1n, lda1,
+                        A2, A2m, A2n, lda2,
+                        T,  Tm,  Tn,  ldt );
+}
+
+static inline void
+INSERT_TASK_zttqrt( const RUNTIME_option_t *options,
+                    int m, int n, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpqrt with l = m. */
+    INSERT_TASK_ztpqrt( options, m, n, m, ib, nb,
+                        A1, A1m, A1n, lda1,
+                        A2, A2m, A2n, lda2,
+                        T,  Tm,  Tn,  ldt );
+}
+
+static inline void
+INSERT_TASK_ztsmlq( const RUNTIME_option_t *options,
+                    cham_side_t side, cham_trans_t trans,
+                    int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpmlqt with L = 0; m1 and n1 are not forwarded. */
+    INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, 0, ib, nb,
+                         V, Vm, Vn, ldv, T, Tm, Tn, ldt,
+                         A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
+}
+
+static inline void
+INSERT_TASK_ztsmqr( const RUNTIME_option_t *options,
+                    cham_side_t side, cham_trans_t trans,
+                    int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpmqrt with l = 0; m1 and n1 are not forwarded. */
+    INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, 0, ib, nb,
+                         V, Vm, Vn, ldv, T, Tm, Tn, ldt,
+                         A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
+}
+
+static inline void
+INSERT_TASK_zttmlq( const RUNTIME_option_t *options,
+                    cham_side_t side, cham_trans_t trans,
+                    int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpmlqt with L = n2; m1 and n1 are not forwarded. */
+    INSERT_TASK_ztpmlqt( options, side, trans, m2, n2, k, n2, ib, nb,
+                         V, Vm, Vn, ldv, T, Tm, Tn, ldt,
+                         A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
+}
+
+static inline void
+INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
+                    cham_side_t side, cham_trans_t trans,
+                    int m1, int n1, int m2, int n2, int k, int ib, int nb,
+                    const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
+                    const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
+                    const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                    const CHAM_desc_t *T, int Tm, int Tn, int ldt )
+{ /* Legacy entry point: forwards to INSERT_TASK_ztpmqrt with l = m2; m1 and n1 are not forwarded. */
+    INSERT_TASK_ztpmqrt( options, side, trans, m2, n2, k, m2, ib, nb,
+                         V, Vm, Vn, ldv, T, Tm, Tn, ldt,
+                         A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
+}
 
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index f14ef4838888f5481b52774b5b63116825fc00d8..73503ee0ce89aff056eb5971f4904bdbe0787315 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -21,7 +21,7 @@
 #  @author Cedric Castagnede
 #  @author Emmanuel Agullo
 #  @author Mathieu Faverge
-#  @date 2012-07-13
+#  @date 2018-11-07
 #
 ###
 
@@ -86,17 +86,9 @@ set(CODELETS_ZSRC
     codelets/codelet_ztrasm.c
     codelets/codelet_ztrssq.c
     codelets/codelet_ztrtri.c
-    codelets/codelet_ztslqt.c
-    codelets/codelet_ztsmlq.c
-    codelets/codelet_ztsmqr.c
     codelets/codelet_ztsmlq_hetra1.c
     codelets/codelet_ztsmqr_hetra1.c
-    codelets/codelet_ztsqrt.c
     codelets/codelet_ztstrf.c
-    codelets/codelet_zttlqt.c
-    codelets/codelet_zttmlq.c
-    codelets/codelet_zttmqr.c
-    codelets/codelet_zttqrt.c
     codelets/codelet_zunmlq.c
     codelets/codelet_zunmqr.c
     ##################
diff --git a/runtime/parsec/codelets/codelet_ztslqt.c b/runtime/parsec/codelets/codelet_ztslqt.c
deleted file mode 100644
index 89c8721131a948e63657ebbd08eea7c553dca5c2..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_ztslqt.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- *
- * @file parsec/codelet_ztslqt.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztslqt PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_ztslqt_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    parsec_dtd_unpack_args(
-        this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK );
-
-    CORE_ztslqt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK );
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_ztslqt_parsec, options->priority, "tslqt",
-        sizeof(int),            &m,                     VALUE,
-        sizeof(int),            &n,                     VALUE,
-        sizeof(int),            &ib,                    VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),            &lda1,                  VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),            &lda2,                  VALUE,
-        PASSED_BY_REF,          RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT,
-        sizeof(int),                        &ldt,       VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,       SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,       SCRATCH,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_ztsmlq.c b/runtime/parsec/codelets/codelet_ztsmlq.c
deleted file mode 100644
index 56b86887aec4bcf6c3ad6685d6f027a40b188491..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_ztsmlq.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- *
- * @file parsec/codelet_ztsmlq.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmlq PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_ztsmlq_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    parsec_dtd_unpack_args(
-        this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork );
-
-    CORE_ztsmlq( side, trans, m1, n1, m2, n2, k, ib,
-                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_ztsmlq_parsec, options->priority, "tsmlq",
-        sizeof(int),                &side,       VALUE,
-        sizeof(int),                &trans,      VALUE,
-        sizeof(int),                        &m1,        VALUE,
-        sizeof(int),                        &n1,        VALUE,
-        sizeof(int),                        &m2,        VALUE,
-        sizeof(int),                        &n2,        VALUE,
-        sizeof(int),                        &k,         VALUE,
-        sizeof(int),                        &ib,        VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),           &lda1,                   VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),           &lda2,                   VALUE,
-        PASSED_BY_REF,         RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT,
-        sizeof(int),           &ldv,                    VALUE,
-        PASSED_BY_REF,         RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT,
-        sizeof(int),           &ldt,                    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,       SCRATCH,
-        sizeof(int),           &ldwork,                 VALUE,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_ztsmqr.c b/runtime/parsec/codelets/codelet_ztsmqr.c
deleted file mode 100644
index e8059bde8466f5957cd435dbc0e1a7b3b55714eb..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_ztsmqr.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- *
- * @file parsec/codelet_ztsmqr.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmqr PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_ztsmqr_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    parsec_dtd_unpack_args(
-        this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork );
-
-    CORE_ztsmqr( side, trans, m1, n1, m2, n2, k, ib,
-                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_ztsmqr_parsec, options->priority, "tsmqr",
-        sizeof(int),    &side,                              VALUE,
-        sizeof(int),    &trans,                             VALUE,
-        sizeof(int),           &m1,                                VALUE,
-        sizeof(int),           &n1,                                VALUE,
-        sizeof(int),           &m2,                                VALUE,
-        sizeof(int),           &n2,                                VALUE,
-        sizeof(int),           &k,                                 VALUE,
-        sizeof(int),           &ib,                                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),           &lda1,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),           &lda2,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT,
-        sizeof(int),           &ldv,                               VALUE,
-        PASSED_BY_REF,         RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT,
-        sizeof(int),           &ldt,                               VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,                  SCRATCH,
-        sizeof(int),           &ldwork,                            VALUE,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_ztsqrt.c b/runtime/parsec/codelets/codelet_ztsqrt.c
deleted file mode 100644
index a8edb3c0fdf0562f9c374bdb2100919d6f825b97..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_ztsqrt.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- *
- * @file parsec/codelet_ztsqrt.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsqrt PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_ztsqrt_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    parsec_dtd_unpack_args(
-        this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK );
-
-    CORE_ztsqrt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK );
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_ztsqrt_parsec, options->priority, "tsqrt",
-        sizeof(int),    &m,                                 VALUE,
-        sizeof(int),           &n,                                 VALUE,
-        sizeof(int),           &ib,                                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),           &lda1,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),           &lda2,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT,
-        sizeof(int),           &ldt,                               VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,                  SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,                  SCRATCH,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_zttlqt.c b/runtime/parsec/codelets/codelet_zttlqt.c
deleted file mode 100644
index 1a72dd5cffefc3c4cc929558042fa10d5343aa33..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_zttlqt.c
+++ /dev/null
@@ -1,71 +0,0 @@
-/**
- *
- * @file parsec/codelet_zttlqt.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttlqt PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_zttlqt_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    parsec_dtd_unpack_args(
-        this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK );
-
-    CORE_zttlqt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK );
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zttlqt_parsec, options->priority, "ttlqt",
-        sizeof(int),            &m,                     VALUE,
-        sizeof(int),            &n,                     VALUE,
-        sizeof(int),            &ib,                    VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),            &lda1,                  VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),            &lda2,                  VALUE,
-        PASSED_BY_REF,          RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT,
-        sizeof(int),            &ldt,                   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,       SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,       SCRATCH,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_zttmlq.c b/runtime/parsec/codelets/codelet_zttmlq.c
deleted file mode 100644
index b0788876c79e5197cbc30ffa4880b77a68531ff5..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_zttmlq.c
+++ /dev/null
@@ -1,89 +0,0 @@
-/**
- *
- * @file parsec/codelet_zttmlq.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmlq PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_zttmlq_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    parsec_dtd_unpack_args(
-        this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork );
-
-    CORE_zttmlq( side, trans, m1, n1, m2, n2, k, ib, A1, lda1,
-                 A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zttmlq_parsec, options->priority, "ttmlq",
-        sizeof(int),     &side,                      VALUE,
-        sizeof(int),     &trans,                     VALUE,
-        sizeof(int),            &m1,                        VALUE,
-        sizeof(int),            &n1,                        VALUE,
-        sizeof(int),            &m2,                        VALUE,
-        sizeof(int),            &n2,                        VALUE,
-        sizeof(int),            &k,                         VALUE,
-        sizeof(int),            &ib,                        VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),            &lda1,                      VALUE,
-        PASSED_BY_REF,          RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),            &lda2,                      VALUE,
-        PASSED_BY_REF,          RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT,
-        sizeof(int),            &ldv,                       VALUE,
-        PASSED_BY_REF,          RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT,
-        sizeof(int),            &ldt,                       VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,           SCRATCH,
-        sizeof(int),            &ldwork,                    VALUE,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_zttmqr.c b/runtime/parsec/codelets/codelet_zttmqr.c
deleted file mode 100644
index f8a8b8f6bc9fb415323554d7eb8efea1dab0725d..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_zttmqr.c
+++ /dev/null
@@ -1,90 +0,0 @@
-/**
- *
- * @file parsec/codelet_zttmqr.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmqr PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_zttmqr_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    parsec_dtd_unpack_args(
-        this_task, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib, &A1, &lda1, &A2, &lda2, &V, &ldv, &T, &ldt, &WORK, &ldwork );
-
-    CORE_zttmqr( side, trans, m1, n1, m2, n2, k, ib,
-                 A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-
-void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zttmqr_parsec, options->priority, "ttmqr",
-        sizeof(int),    &side,                             VALUE,
-        sizeof(int),    &trans,                            VALUE,
-        sizeof(int),           &m1,                               VALUE,
-        sizeof(int),           &n1,                               VALUE,
-        sizeof(int),           &m2,                               VALUE,
-        sizeof(int),           &n2,                               VALUE,
-        sizeof(int),           &k,                                VALUE,
-        sizeof(int),           &ib,                               VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),           &lda1,                             VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),           &lda2,                             VALUE,
-        PASSED_BY_REF,         RTBLKADDR( V, CHAMELEON_Complex64_t, Vm, Vn ), chameleon_parsec_get_arena_index( V ) | INPUT,
-        sizeof(int),           &ldv,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | INPUT,
-        sizeof(int),           &ldt,                              VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,                            SCRATCH,
-        sizeof(int),           &ldwork,                           VALUE,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/parsec/codelets/codelet_zttqrt.c b/runtime/parsec/codelets/codelet_zttqrt.c
deleted file mode 100644
index 6b22180076622b164425b06089774675fc873cfb..0000000000000000000000000000000000000000
--- a/runtime/parsec/codelets/codelet_zttqrt.c
+++ /dev/null
@@ -1,70 +0,0 @@
-/**
- *
- * @file parsec/codelet_zttqrt.c
- *
- * @copyright 2009-2015 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttqrt PaRSEC codelet
- *
- * @version 1.0.0
- * @author Reazul Hoque
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_parsec.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_zttqrt_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    parsec_dtd_unpack_args(
-        this_task, &m, &n, &ib, &A1, &lda1, &A2, &lda2, &T, &ldt, &TAU, &WORK );
-
-    CORE_zttqrt( m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK );
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
-
-void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zttqrt_parsec, options->priority,  "ttqrt",
-        sizeof(int),    &m,                                 VALUE,
-        sizeof(int),           &n,                                 VALUE,
-        sizeof(int),           &ib,                                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A1, CHAMELEON_Complex64_t, A1m, A1n ), chameleon_parsec_get_arena_index( A1 ) | INOUT,
-        sizeof(int),           &lda1,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A2, CHAMELEON_Complex64_t, A2m, A2n ), chameleon_parsec_get_arena_index( A2 ) | INOUT | AFFINITY,
-        sizeof(int),           &lda2,                              VALUE,
-        PASSED_BY_REF,         RTBLKADDR( T, CHAMELEON_Complex64_t, Tm, Tn ), chameleon_parsec_get_arena_index( T ) | OUTPUT,
-        sizeof(int),           &ldt,                               VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,                  SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,                  SCRATCH,
-        PARSEC_DTD_ARG_END );
-}
diff --git a/runtime/quark/codelets/codelet_ztplqt.c b/runtime/quark/codelets/codelet_ztplqt.c
index 5b3f15ab4527f3d753d3e74d1d65932f6ce84e34..f0e51b3754d6460ea9d3be4a8e9d4f826c8a997f 100644
--- a/runtime/quark/codelets/codelet_ztplqt.c
+++ b/runtime/quark/codelets/codelet_ztplqt.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -50,7 +50,7 @@ void INSERT_TASK_ztplqt( const RUNTIME_option_t *options,
                          const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSLQT;
+    DAG_CORE_TPLQT;
 
     int shapeB = ( L == 0 ) ? 0 : (QUARK_REGION_L | QUARK_REGION_D);
 
diff --git a/runtime/quark/codelets/codelet_ztpmlqt.c b/runtime/quark/codelets/codelet_ztpmlqt.c
index e82f40c4dbe58525926f2381eef3cea39fb6ccdc..fa435550dbb7cea349898f43b5e8fa9ec71c8ef4 100644
--- a/runtime/quark/codelets/codelet_ztpmlqt.c
+++ b/runtime/quark/codelets/codelet_ztpmlqt.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -57,7 +57,7 @@ void INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
                          const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSMQR;
+    DAG_CORE_TPMLQT;
 
     int shapeV = ( L == 0 ) ? 0 : (QUARK_REGION_L | QUARK_REGION_D);
 
diff --git a/runtime/quark/codelets/codelet_ztpmqrt.c b/runtime/quark/codelets/codelet_ztpmqrt.c
index 933ca3f1327d999f671f786002e6f559eb0130f8..bdf6627273065b4bdf18d93f07384628d5b113f7 100644
--- a/runtime/quark/codelets/codelet_ztpmqrt.c
+++ b/runtime/quark/codelets/codelet_ztpmqrt.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -57,7 +57,7 @@ void INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
                          const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSMQR;
+    DAG_CORE_TPMQRT;
 
     int shapeV = ( L == 0 ) ? 0 : (QUARK_REGION_U | QUARK_REGION_D);
 
diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c
index 50470ac8e806aed4479679a577629091f573b4cd..24ce98e124023f90184379c211200d21693ed503 100644
--- a/runtime/quark/codelets/codelet_ztpqrt.c
+++ b/runtime/quark/codelets/codelet_ztpqrt.c
@@ -13,7 +13,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -50,7 +50,7 @@ void INSERT_TASK_ztpqrt( const RUNTIME_option_t *options,
                          const CHAM_desc_t *T, int Tm, int Tn, int ldt )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSQRT;
+    DAG_CORE_TPQRT;
 
     int shapeB = ( L == 0 ) ? 0 : (QUARK_REGION_U | QUARK_REGION_D);
 
diff --git a/runtime/quark/codelets/codelet_ztslqt.c b/runtime/quark/codelets/codelet_ztslqt.c
deleted file mode 100644
index 4efb19be91de19cb3819cf674e4a08a9f1defaaf..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_ztslqt.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- *
- * @file quark/codelet_ztslqt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztslqt Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_ztslqt_quark(Quark *quark)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-    CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztslqt computes a LQ factorization of a rectangular matrix
- *  formed by coupling side-by-side a complex M-by-M
- *  lower triangular tile A1 and a complex M-by-N tile A2:
- *
- *    | A1 A2 | = L * Q
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)', where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
- *  A2(i,1:n), and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A1 and A2. M >= 0.
- *         The number of columns of the tile A1.
- *
- * @param[in] N
- *         The number of columns of the tile A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M-by-M tile A1.
- *         On exit, the elements on and below the diagonal of the array
- *         contain the M-by-M lower trapezoidal tile L;
- *         the elements above the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N tile A2.
- *         On exit, all the elements with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSLQT;
-    QUARK_Insert_Task(opt->quark, CORE_ztslqt_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),                        &m,     VALUE,
-        sizeof(int),                        &n,     VALUE,
-        sizeof(int),                        &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb,    RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_L | QUARK_REGION_D,
-        sizeof(int),                        &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb,    RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                        &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    OUTPUT,
-        sizeof(int),                        &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,          SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,          SCRATCH,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_ztsmlq.c b/runtime/quark/codelets/codelet_ztsmlq.c
deleted file mode 100644
index b3003d130f6c08c8e5b60130b13b1baf5653ed04..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_ztsmlq.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- *
- * @file quark/codelet_ztsmlq.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmlq Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Azzam Haidar
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_ztsmlq_quark(Quark *quark)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib,
-                         A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-    CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztsmlq overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |     | A1 A2 | * Q
- *                             | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |     | A1 A2 | * Q**H
- *                             | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)'
- *
- *  as returned by CORE_ZTSLQT.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *         M2 = M1 if side == ChamRight.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *         N2 = N1 if side == ChamLeft.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTSLQT in the first k rows of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size
- *             LDWORK-by-M1 if side == ChamLeft
- *             LDWORK-by-IB if side == ChamRight
- *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK.
- *             LDWORK >= max(1,IB) if side == ChamLeft
- *             LDWORK >= max(1,N1) if side == ChamRight
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSMLQ;
-    QUARK_Insert_Task(opt->quark, CORE_ztsmlq_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &side,  VALUE,
-        sizeof(int),              &trans, VALUE,
-        sizeof(int),                     &m1,    VALUE,
-        sizeof(int),                     &n1,    VALUE,
-        sizeof(int),                     &m2,    VALUE,
-        sizeof(int),                     &n2,    VALUE,
-        sizeof(int),                     &k,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),    INPUT,
-        sizeof(int),                     &ldv,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    INPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        sizeof(int),                     &ldwork, VALUE,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_ztsmqr.c b/runtime/quark/codelets/codelet_ztsmqr.c
deleted file mode 100644
index afcde5dfa02d9f67e8363f57698b47bd256ea59c..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_ztsmqr.c
+++ /dev/null
@@ -1,190 +0,0 @@
-/**
- *
- * @file quark/codelet_ztsmqr.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmqr Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Azzam Haidar
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_ztsmqr_quark(Quark *quark)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib,
-                         A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-    CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztsmqr overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |     | A1 A2 | * Q
- *                             | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |     | A1 A2 | * Q**H
- *                             | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_ZTSQRT.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *         M2 = M1 if side == ChamRight.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *         N2 = N1 if side == ChamLeft.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTSQRT in the first k columns of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size
- *             LDWORK-by-N1 if side == ChamLeft
- *             LDWORK-by-IB if side == ChamRight
- *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK.
- *             LDWORK >= max(1,IB) if side == ChamLeft
- *             LDWORK >= max(1,M1) if side == ChamRight
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSMQR;
-    QUARK_Insert_Task(opt->quark, CORE_ztsmqr_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &side,  VALUE,
-        sizeof(int),              &trans, VALUE,
-        sizeof(int),                     &m1,    VALUE,
-        sizeof(int),                     &n1,    VALUE,
-        sizeof(int),                     &m2,    VALUE,
-        sizeof(int),                     &n2,    VALUE,
-        sizeof(int),                     &k,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),    INPUT,
-        sizeof(int),                     &ldv,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    INPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        sizeof(int),                     &ldwork, VALUE,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_ztsqrt.c b/runtime/quark/codelets/codelet_ztsqrt.c
deleted file mode 100644
index 44457debb9112a93fd00ad3a6f6585d7f9d49ef0..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_ztsqrt.c
+++ /dev/null
@@ -1,131 +0,0 @@
-/**
- *
- * @file quark/codelet_ztsqrt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsqrt Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_ztsqrt_quark(Quark *quark)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-    CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * CORE_ztsqrt computes a QR factorization of a rectangular matrix
- * formed by coupling a complex N-by-N upper triangular tile A1
- * on top of a complex M-by-N tile A2:
- *
- *    | A1 | = Q * R
- *    | A2 |
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of columns of the tile A2. M >= 0.
- *
- * @param[in] N
- *         The number of rows of the tile A1.
- *         The number of columns of the tiles A1 and A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the N-by-N tile A1.
- *         On exit, the elements on and above the diagonal of the array
- *         contain the N-by-N upper trapezoidal tile R;
- *         the elements below the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N tile A2.
- *         On exit, all the elements with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TSQRT;
-    QUARK_Insert_Task(opt->quark, CORE_ztsqrt_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),                        &m,     VALUE,
-        sizeof(int),                        &n,     VALUE,
-        sizeof(int),                        &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb,    RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_U | QUARK_REGION_D,
-        sizeof(int),                        &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb,    RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                        &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    OUTPUT,
-        sizeof(int),                        &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,       NULL,          SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb,    NULL,          SCRATCH,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_zttlqt.c b/runtime/quark/codelets/codelet_zttlqt.c
deleted file mode 100644
index 85eb8e3d80859879501a3592eff0ec4628c88e13..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_zttlqt.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- *
- * @file quark/codelet_zttlqt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttlqt Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_zttlqt_quark(Quark *quark)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-    CORE_zttlqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttlqt computes a LQ factorization of a rectangular matrix
- *  formed by coupling side-by-side a complex M-by-M lower triangular tile A1
- *  and a complex M-by-N lower triangular tile A2:
- *
- *    | A1 A2 | = L * Q
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)', where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
- *  A2(i,1:n), and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A1 and A2. M >= 0.
- *         The number of columns of the tile A1.
- *
- * @param[in] N
- *         The number of columns of the tile A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M-by-M tile A1.
- *         On exit, the elements on and below the diagonal of the array
- *         contain the M-by-M lower trapezoidal tile L;
- *         the elements above the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1.  LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N lower triangular tile A2.
- *         On exit, the elements on and below the diagonal of the array
- *         with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the array A2.  LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[in,out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TTLQT;
-    QUARK_Insert_Task(opt->quark, CORE_zttlqt_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),                     &m,     VALUE,
-        sizeof(int),                     &n,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_L | QUARK_REGION_D,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | QUARK_REGION_L | QUARK_REGION_D | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    OUTPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,    NULL,          SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_zttmlq.c b/runtime/quark/codelets/codelet_zttmlq.c
deleted file mode 100644
index f3701869ca32a8a1376a8cf04d500eb8049a36f0..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_zttmlq.c
+++ /dev/null
@@ -1,182 +0,0 @@
-/**
- *
- * @file quark/codelet_zttmlq.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmlq Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_zttmlq_quark(Quark *quark)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib,
-                         A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-    CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1,
-                A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttmlq overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 (N1 == N2) with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |       | A1 | * Q
- *                             | A2 |       | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |       | A1 | * Q**H
- *                             | A2 |       | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_zttqrt.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTTQRT in the first k rows of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size LDWORK-by-N1.
- *
- * @param[in] LDWORK
- *         The dimension of the array WORK. LDWORK >= max(1,IB).
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TTMLQ;
-    QUARK_Insert_Task(opt->quark, CORE_zttmlq_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &side,  VALUE,
-        sizeof(int),              &trans, VALUE,
-        sizeof(int),                     &m1,    VALUE,
-        sizeof(int),                     &n1,    VALUE,
-        sizeof(int),                     &m2,    VALUE,
-        sizeof(int),                     &n2,    VALUE,
-        sizeof(int),                     &k,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),    INPUT | QUARK_REGION_L | QUARK_REGION_D,
-        sizeof(int),                     &ldv,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    INPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        sizeof(int),                     &ldwork,    VALUE,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_zttmqr.c b/runtime/quark/codelets/codelet_zttmqr.c
deleted file mode 100644
index e106a34ce42948c3786f10f3af9930368299a781..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_zttmqr.c
+++ /dev/null
@@ -1,183 +0,0 @@
-/**
- *
- * @file quark/codelet_zttmqr.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmqr Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static void
-CORE_zttmqr_quark( Quark *quark )
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    quark_unpack_args_18(quark, side, trans, m1, n1, m2, n2, k, ib,
-                         A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-    CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttmqr overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 (N1 == N2) with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |       | A1 | * Q
- *                             | A2 |       | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |       | A1 | * Q**H
- *                             | A2 |       | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_zttqrt.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTTQRT in the first k rows of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size LDWORK-by-N1.
- *
- * @param[in] LDWORK
- *         The dimension of the array WORK. LDWORK >= max(1,IB).
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TTMQR;
-    QUARK_Insert_Task(opt->quark, CORE_zttmqr_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &side,  VALUE,
-        sizeof(int),              &trans, VALUE,
-        sizeof(int),                     &m1,    VALUE,
-        sizeof(int),                     &n1,    VALUE,
-        sizeof(int),                     &m2,    VALUE,
-        sizeof(int),                     &n2,    VALUE,
-        sizeof(int),                     &k,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),    INPUT | QUARK_REGION_U | QUARK_REGION_D,
-        sizeof(int),                     &ldv,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    INPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        sizeof(int),                     &ldwork, VALUE,
-        0);
-}
diff --git a/runtime/quark/codelets/codelet_zttqrt.c b/runtime/quark/codelets/codelet_zttqrt.c
deleted file mode 100644
index d5f62c44110d6d8f39baecfd9ca2ba75c611f0b1..0000000000000000000000000000000000000000
--- a/runtime/quark/codelets/codelet_zttqrt.c
+++ /dev/null
@@ -1,143 +0,0 @@
-/**
- *
- * @file quark/codelet_zttqrt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttqrt Quark codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_quark.h"
-#include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_zttqrt_quark(Quark *quark)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    quark_unpack_args_11(quark, m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-    CORE_zttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttqrt computes a QR factorization of a rectangular matrix
- *  formed by coupling a complex N-by-N upper triangular tile A1
- *  on top of a complex M-by-N upper trapezoidal tile A2:
- *
- *    | A1 | = Q * R
- *    | A2 |
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k), where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A2(1:m,i),
- *  and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A2.  M >= 0.
- *
- * @param[in] N
- *         The number of columns of the tile A1 and A2.  N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the N-by-N tile A1.
- *         On exit, the elements on and above the diagonal of the array
- *         contain the N-by-N upper trapezoidal tile R;
- *         the elements below the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1.  LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N upper triangular tile A2.
- *         On exit, the elements on and above the diagonal of the array
- *         with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the array A2.  LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[in,out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    DAG_CORE_TTQRT;
-    QUARK_Insert_Task(opt->quark, CORE_zttqrt_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),                     &m,     VALUE,
-        sizeof(int),                     &n,     VALUE,
-        sizeof(int),                     &ib,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n), INOUT | QUARK_REGION_U | QUARK_REGION_D,
-        sizeof(int),                     &lda1,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb*nb, RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n), INOUT | QUARK_REGION_U | QUARK_REGION_D | LOCALITY,
-        sizeof(int),                     &lda2,  VALUE,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),    OUTPUT,
-        sizeof(int),                     &ldt,   VALUE,
-        sizeof(CHAMELEON_Complex64_t)*nb,    NULL,          SCRATCH,
-        sizeof(CHAMELEON_Complex64_t)*ib*nb, NULL,          SCRATCH,
-        0);
-}
diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h
index fbee7539cdfcaa3066a71b5b85eb243a903325dc..649330aa1871b6e60cd322ba34f49ba9d4839af8 100644
--- a/runtime/quark/include/core_blas_dag.h
+++ b/runtime/quark/include/core_blas_dag.h
@@ -14,7 +14,7 @@
  * @version 1.0.0
  * @author Mathieu Faverge
  * @author Cedric Castagnede
- * @date 2010-11-15
+ * @date 2018-11-08
  *
  */
 #ifndef _core_blas_dag_h_
@@ -71,16 +71,21 @@
 #define DAG_CORE_TRSM       DAG_SET_PROPERTIES( "TRSM"      , "cyan"    )
 #define DAG_CORE_TRSSQ      DAG_SET_PROPERTIES( "TRSSQ"     , "white"   )
 #define DAG_CORE_TRTRI      DAG_SET_PROPERTIES( "TRTRI"     , "white"   )
-#define DAG_CORE_TSLQT      DAG_SET_PROPERTIES( "TSLQT"     , "red"     )
-#define DAG_CORE_TSMLQ      DAG_SET_PROPERTIES( "TSMLQ"     , "yellow"  )
-#define DAG_CORE_TSMQR      DAG_SET_PROPERTIES( "TSMQR"     , "yellow"  )
-#define DAG_CORE_TSQRT      DAG_SET_PROPERTIES( "TSQRT"     , "red"     )
+#define DAG_CORE_TPLQT      DAG_SET_PROPERTIES( "TPLQT"     , "red"     )
+#define DAG_CORE_TPMLQT     DAG_SET_PROPERTIES( "TPMLQT"    , "yellow"  )
+#define DAG_CORE_TPMQRT     DAG_SET_PROPERTIES( "TPMQRT"    , "yellow"  )
+#define DAG_CORE_TPQRT      DAG_SET_PROPERTIES( "TPQRT"     , "red"     )
 #define DAG_CORE_TSTRF      DAG_SET_PROPERTIES( "TSTRF"     , "red"     )
-#define DAG_CORE_TTLQT      DAG_SET_PROPERTIES( "TTLQT"     , "pink"    )
-#define DAG_CORE_TTMLQ      DAG_SET_PROPERTIES( "TTMLQ"     , "magenta" )
-#define DAG_CORE_TTMQR      DAG_SET_PROPERTIES( "TTMQR"     , "magenta" )
-#define DAG_CORE_TTQRT      DAG_SET_PROPERTIES( "TTQRT"     , "pink"    )
 #define DAG_CORE_UNMLQ      DAG_SET_PROPERTIES( "UNMLQ"     , "cyan"    )
 #define DAG_CORE_UNMQR      DAG_SET_PROPERTIES( "UNMQR"     , "cyan"    )
 
+#define DAG_CORE_TSLQT      DAG_CORE_TPLQT
+#define DAG_CORE_TSMLQ      DAG_CORE_TPMLQT
+#define DAG_CORE_TSMQR      DAG_CORE_TPMQRT
+#define DAG_CORE_TSQRT      DAG_CORE_TPQRT
+#define DAG_CORE_TTLQT      DAG_CORE_TPLQT
+#define DAG_CORE_TTMLQ      DAG_CORE_TPMLQT
+#define DAG_CORE_TTMQR      DAG_CORE_TPMQRT
+#define DAG_CORE_TTQRT      DAG_CORE_TPQRT
+
 #endif /* _core_blas_dag_h_ */
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index c42035d967b99c896705c8341a70155e26678025..745b122e3f11a4c9d92a126369fb736d9b8bb4fe 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -15,7 +15,7 @@
  *  @author Mathieu Faverge
  *  @author Cedric Augonnet
  *  @author Florent Pruvost
- *  @date 2015-09-16
+ *  @date 2018-11-08
  *  @precisions normal z -> c d s
  *
  */
@@ -69,16 +69,8 @@ CHAMELEON_CL_CB(ztrasm,        starpu_matrix_get_nx(task->handles[0]), starpu_ma
 CHAMELEON_CL_CB(ztrmm,         starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0,                                               M*M*N)
 CHAMELEON_CL_CB(ztrsm,         starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0,                                               M*M*N)
 CHAMELEON_CL_CB(ztrtri,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M *M*M)
-CHAMELEON_CL_CB(ztslqt,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     2. *M* M*M)
-CHAMELEON_CL_CB(ztsmlq,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
-CHAMELEON_CL_CB(ztsmqr,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
 CHAMELEON_CL_CB(ztsmlq_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
 CHAMELEON_CL_CB(ztsmqr_hetra1, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (4.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
-CHAMELEON_CL_CB(ztsqrt,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     2. *M* M*M)
 CHAMELEON_CL_CB(ztstrf,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),         M* M*M)
-CHAMELEON_CL_CB(zttlqt,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     1. *M* M*M)
-CHAMELEON_CL_CB(zttmlq,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
-CHAMELEON_CL_CB(zttmqr,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2.0*M+starpu_matrix_get_nx(task->handles[3]))*M*M)
-CHAMELEON_CL_CB(zttqrt,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     1. *M* M*M)
 CHAMELEON_CL_CB(zunmlq,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     2. *M* M*M)
 CHAMELEON_CL_CB(zunmqr,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]),     2. *M* M*M)
diff --git a/runtime/starpu/codelets/codelet_ztpmlqt.c b/runtime/starpu/codelets/codelet_ztpmlqt.c
index 714b45762488101d571b7ed4a7f1132a256d5301..8dffa4ff2292c448c710713a7556bec3920f3d73 100644
--- a/runtime/starpu/codelets/codelet_ztpmlqt.c
+++ b/runtime/starpu/codelets/codelet_ztpmlqt.c
@@ -11,7 +11,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-07
  * @precisions normal z -> s d c
  *
  */
@@ -37,6 +37,7 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
     CHAMELEON_Complex64_t *B;
     int ldb;
     CHAMELEON_Complex64_t *WORK;
+    size_t lwork;
 
     V    = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
     T    = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -45,13 +46,15 @@ static void cl_ztpmlqt_cpu_func(void *descr[], void *cl_arg)
     WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
-                                &ldv, &ldt, &lda, &ldb );
+                                &ldv, &ldt, &lda, &ldb, &lwork );
 
     CORE_ztpmlqt( side, trans, M, N, K, L, ib,
                   V, ldv, T, ldt, A, lda, B, ldb, WORK );
+
+    (void)lwork;
 }
 
-#if defined(CHAMELEON_USE_CUDA) && 0
+#if defined(CHAMELEON_USE_CUDA)
 static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
 {
     cham_side_t side;
@@ -70,6 +73,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *B;
     int ldb;
     cuDoubleComplex *W;
+    size_t lwork;
 
     V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -78,14 +82,14 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
     W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
-                                &ldv, &ldt, &lda, &ldb );
+                                &ldv, &ldt, &lda, &ldb, &lwork );
 
     RUNTIME_getStream(stream);
 
     CUDA_ztpmlqt(
             side, trans, M, N, K, L, ib,
             V, ldv, T, ldt, A, lda, B, ldb,
-            W, stream );
+            W, lwork, stream );
 
 #ifndef STARPU_CUDA_ASYNC
     cudaStreamSynchronize( stream );
@@ -97,8 +101,7 @@ static void cl_ztpmlqt_cuda_func(void *descr[], void *cl_arg)
 /*
  * Codelet definition
  */
-CODELETS_CPU(ztpmlqt, 5, cl_ztpmlqt_cpu_func)
-//CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC)
+CODELETS(ztpmlqt, 5, cl_ztpmlqt_cpu_func, cl_ztpmlqt_cuda_func, STARPU_CUDA_ASYNC)
 
 void
 INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
@@ -136,6 +139,7 @@ INSERT_TASK_ztpmlqt( const RUNTIME_option_t *options,
         STARPU_VALUE, &lda,   sizeof(int),
         STARPU_RW,     RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
         STARPU_VALUE, &ldb,   sizeof(int),
+        STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
         /* Other options */
         STARPU_SCRATCH,   options->ws_worker,
         STARPU_PRIORITY,  options->priority,
diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c
index 40f83ab2d137bfe6c9de11d58a4ad05307edb78e..6684e59f82d579cd129a1ff40c25dd377bc6166e 100644
--- a/runtime/starpu/codelets/codelet_ztpmqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpmqrt.c
@@ -11,7 +11,7 @@
  *
  * @version 1.0.0
  * @author Mathieu Faverge
- * @date 2016-12-15
+ * @date 2018-11-07
  * @precisions normal z -> s d c
  *
  */
@@ -37,6 +37,7 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
     CHAMELEON_Complex64_t *B;
     int ldb;
     CHAMELEON_Complex64_t *WORK;
+    size_t lwork;
 
     V    = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
     T    = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
@@ -45,10 +46,12 @@ static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
     WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
-                                &ldv, &ldt, &lda, &ldb );
+                                &ldv, &ldt, &lda, &ldb, &lwork );
 
     CORE_ztpmqrt( side, trans, M, N, K, L, ib,
                   V, ldv, T, ldt, A, lda, B, ldb, WORK );
+
+    (void)lwork;
 }
 
 
@@ -71,22 +74,23 @@ static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
     cuDoubleComplex *B;
     int ldb;
     cuDoubleComplex *W;
+    size_t lwork;
 
     V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
     T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
     A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
     B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-    W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
+    W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 3*ib*nb */
 
     starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
-                                &ldv, &ldt, &lda, &ldb );
+                                &ldv, &ldt, &lda, &ldb, &lwork );
 
     RUNTIME_getStream(stream);
 
     CUDA_ztpmqrt(
             side, trans, M, N, K, L, ib,
             V, ldv, T, ldt, A, lda, B, ldb,
-            W, stream );
+            W, lwork, stream );
 
 #ifndef STARPU_CUDA_ASYNC
     cudaStreamSynchronize( stream );
@@ -102,12 +106,12 @@ CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYN
 
 void
 INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
-                    cham_side_t side, cham_trans_t trans,
-                    int M, int N, int K, int L, int ib, int nb,
-                    const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                    const CHAM_desc_t *T, int Tm, int Tn, int ldt,
-                    const CHAM_desc_t *A, int Am, int An, int lda,
-                    const CHAM_desc_t *B, int Bm, int Bn, int ldb )
+                     cham_side_t side, cham_trans_t trans,
+                     int M, int N, int K, int L, int ib, int nb,
+                     const CHAM_desc_t *V, int Vm, int Vn, int ldv,
+                     const CHAM_desc_t *T, int Tm, int Tn, int ldt,
+                     const CHAM_desc_t *A, int Am, int An, int lda,
+                     const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
     struct starpu_codelet *codelet = &cl_ztpmqrt;
     void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL;
@@ -136,6 +140,7 @@ INSERT_TASK_ztpmqrt( const RUNTIME_option_t *options,
         STARPU_VALUE, &lda,   sizeof(int),
         STARPU_RW,     RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
         STARPU_VALUE, &ldb,   sizeof(int),
+        STARPU_VALUE, &(options->ws_wsize), sizeof(size_t),
         /* Other options */
         STARPU_SCRATCH,   options->ws_worker,
         STARPU_PRIORITY,  options->priority,
diff --git a/runtime/starpu/codelets/codelet_ztslqt.c b/runtime/starpu/codelets/codelet_ztslqt.c
deleted file mode 100644
index 870e1349af880473ae1e92787e84e0a6ad3df035..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_ztslqt.c
+++ /dev/null
@@ -1,174 +0,0 @@
-/**
- *
- * @file starpu/codelet_ztslqt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztslqt StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztslqt computes a LQ factorization of a rectangular matrix
- *  formed by coupling side-by-side a complex M-by-M
- *  lower triangular tile A1 and a complex M-by-N tile A2:
- *
- *    | A1 A2 | = L * Q
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)', where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
- *  A2(i,1:n), and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A1 and A2. M >= 0.
- *         The number of columns of the tile A1.
- *
- * @param[in] N
- *         The number of columns of the tile A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M-by-M tile A1.
- *         On exit, the elements on and below the diagonal of the array
- *         contain the M-by-M lower trapezoidal tile L;
- *         the elements above the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N tile A2.
- *         On exit, all the elements with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_ztslqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_ztslqt;
-    void (*callback)(void*) = options->profiling ? cl_ztslqt_callback : NULL;
-    CHAMELEON_starpu_ws_t *h_work = (CHAMELEON_starpu_ws_t*)(options->ws_host);
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_W(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &m,                 sizeof(int),
-        STARPU_VALUE,    &n,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_W,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-        /* max( nb * (ib+1), ib * (ib+nb) ) */
-        STARPU_SCRATCH,   options->ws_worker,
-        /* /\* 2 * ib * (nb+ib) + nb *\/ */
-        STARPU_VALUE,    &h_work,            sizeof(CHAMELEON_starpu_ws_t *),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "ztslqt",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_ztslqt_cpu_func(void *descr[], void *cl_arg)
-{
-    CHAMELEON_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU, *WORK;
-
-    A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    T  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    TAU= (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb + ib*nb */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
-
-    WORK = TAU + chameleon_max( m, n );
-    CORE_ztslqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(ztslqt, 4, cl_ztslqt_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_ztsmlq.c b/runtime/starpu/codelets/codelet_ztsmlq.c
deleted file mode 100644
index b0a2e38ec8ead1eb249c0e29ba5f98f8a8e91a90..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_ztsmlq.c
+++ /dev/null
@@ -1,266 +0,0 @@
-/**
- *
- * @file starpu/codelet_ztsmlq.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmlq StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Azzam Haidar
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztsmlq overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |     | A1 A2 | * Q
- *                             | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |     | A1 A2 | * Q**H
- *                             | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)'
- *
- *  as returned by CORE_ZTSLQT.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *         M2 = M1 if side == ChamRight.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *         N2 = N1 if side == ChamLeft.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTSLQT in the first k rows of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size
- *             LDWORK-by-M1 if side == ChamLeft
- *             LDWORK-by-IB if side == ChamRight
- *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK.
- *             LDWORK >= max(1,IB) if side == ChamLeft
- *             LDWORK >= max(1,N1) if side == ChamRight
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_ztsmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    struct starpu_codelet *codelet = &cl_ztsmlq;
-    void (*callback)(void*) = options->profiling ? cl_ztsmlq_callback : NULL;
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_R(V, Vm, Vn);
-    CHAMELEON_ACCESS_R(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &side,              sizeof(int),
-        STARPU_VALUE,    &trans,             sizeof(int),
-        STARPU_VALUE,    &m1,                sizeof(int),
-        STARPU_VALUE,    &n1,                sizeof(int),
-        STARPU_VALUE,    &m2,                sizeof(int),
-        STARPU_VALUE,    &n2,                sizeof(int),
-        STARPU_VALUE,    &k,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_R,         RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),
-        STARPU_VALUE,    &ldv,               sizeof(int),
-        STARPU_R,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-        /* max( ib*nb, 2*ib*nb ) */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_VALUE,    &ldwork,            sizeof(int),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "ztsmlq",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_ztsmlq_cpu_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    A1   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
-    WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    CORE_ztsmlq(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-#if defined(CHAMELEON_USE_CUDA)
-static void cl_ztsmlq_cuda_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    cuDoubleComplex *A1;
-    int lda1;
-    cuDoubleComplex *A2;
-    int lda2;
-    cuDoubleComplex *V;
-    int ldv;
-    cuDoubleComplex *T;
-    int ldt;
-    cuDoubleComplex *W, *WC;
-    int ldwork;
-    int ldworkc;
-
-    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    WC = W + ib * ldwork;
-    ldworkc = (side == ChamLeft) ? m1 : ib;
-
-    RUNTIME_getStream(stream);
-
-    CUDA_ztsmlq( side, trans, m1, n1, m2, n2, k, ib,
-                      A1, lda1, A2, lda2, V, ldv, T, ldt,
-                      W, ldwork, WC, ldworkc, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-}
-#endif /* defined(CHAMELEON_USE_CUDA) */
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS(ztsmlq, 5, cl_ztsmlq_cpu_func, cl_ztsmlq_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c
index 8996121be9fbce09fb360a4cbe0c7ec262cff5fc..d68e2bebf803ac1c449c30435dec5ea1883ba86e 100644
--- a/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c
+++ b/runtime/starpu/codelets/codelet_ztsmlq_hetra1.c
@@ -15,7 +15,7 @@
  * @author Hatem Ltaief
  * @author Mathieu Faverge
  * @author Azzam Haidar
- * @date 2010-11-15
+ * @date 2018-11-07
  * @precisions normal z -> c d s
  *
  */
@@ -106,8 +106,8 @@ static void cl_ztsmlq_hetra1_cpu_func(void *descr[], void *cl_arg)
     T     = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
     WORK  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
 
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k,
-                               &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork);
+    starpu_codelet_unpack_args( cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k,
+                                &ib, &nb, &lda1, &lda2, &ldv, &ldt, &ldwork);
     CORE_ztsmlq_hetra1(side, trans, m1, n1, m2, n2, k,
                        ib, A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
 }
diff --git a/runtime/starpu/codelets/codelet_ztsmqr.c b/runtime/starpu/codelets/codelet_ztsmqr.c
deleted file mode 100644
index c38a8fbd129656fc5681a25a00bd7a57cefdb6e9..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_ztsmqr.c
+++ /dev/null
@@ -1,271 +0,0 @@
-/**
- *
- * @file starpu/codelet_ztsmqr.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsmqr StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Azzam Haidar
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_ztsmqr overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |     | A1 A2 | * Q
- *                             | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |     | A1 A2 | * Q**H
- *                             | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_ZTSQRT.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *         M2 = M1 if side == ChamRight.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *         N2 = N1 if side == ChamLeft.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTSQRT in the first k columns of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size
- *             LDWORK-by-N1 if side == ChamLeft
- *             LDWORK-by-IB if side == ChamRight
- *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK.
- *             LDWORK >= max(1,IB) if side == ChamLeft
- *             LDWORK >= max(1,M1) if side == ChamRight
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_ztsmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_ztsmqr;
-    void (*callback)(void*) = options->profiling ? cl_ztsmqr_callback : NULL;
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_R(V, Vm, Vn);
-    CHAMELEON_ACCESS_R(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &side,              sizeof(int),
-        STARPU_VALUE,    &trans,             sizeof(int),
-        STARPU_VALUE,    &m1,                sizeof(int),
-        STARPU_VALUE,    &n1,                sizeof(int),
-        STARPU_VALUE,    &m2,                sizeof(int),
-        STARPU_VALUE,    &n2,                sizeof(int),
-        STARPU_VALUE,    &k,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_R,         RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),
-        STARPU_VALUE,    &ldv,               sizeof(int),
-        STARPU_R,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-        /* max( ib*nb, 2*ib*nb ) */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_VALUE,    &ldwork,            sizeof(int),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_USE_MPI)
-        STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n),
-#endif
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "ztsmqr",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_ztsmqr_cpu_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    A1   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
-    WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    CORE_ztsmqr(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-#if defined(CHAMELEON_USE_CUDA)
-static void cl_ztsmqr_cuda_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    cuDoubleComplex *A1;
-    int lda1;
-    cuDoubleComplex *A2;
-    int lda2;
-    cuDoubleComplex *V;
-    int ldv;
-    cuDoubleComplex *T;
-    int ldt;
-    cuDoubleComplex *W, *WC;
-    int ldwork;
-    int ldworkc;
-
-    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    WC = W + ib * (side == ChamLeft ? m1 : n1);
-    ldworkc = (side == ChamLeft) ? m2 : ib;
-
-    RUNTIME_getStream(stream);
-
-    CUDA_ztsmqr(
-            side, trans, m1, n1, m2, n2, k, ib,
-            A1, lda1, A2, lda2, V, ldv, T, ldt,
-            W, ldwork, WC, ldworkc, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-}
-#endif /* defined(CHAMELEON_USE_CUDA) */
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS(ztsmqr, 5, cl_ztsmqr_cpu_func, cl_ztsmqr_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_ztsqrt.c b/runtime/starpu/codelets/codelet_ztsqrt.c
deleted file mode 100644
index cb93ba7c22c3f832f365f293545366f03863bd6c..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_ztsqrt.c
+++ /dev/null
@@ -1,166 +0,0 @@
-/**
- *
- * @file starpu/codelet_ztsqrt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon ztsqrt StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Jakub Kurzak
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * CORE_ztsqrt computes a QR factorization of a rectangular matrix
- * formed by coupling a complex N-by-N upper triangular tile A1
- * on top of a complex M-by-N tile A2:
- *
- *    | A1 | = Q * R
- *    | A2 |
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of columns of the tile A2. M >= 0.
- *
- * @param[in] N
- *         The number of rows of the tile A1.
- *         The number of columns of the tiles A1 and A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the N-by-N tile A1.
- *         On exit, the elements on and above the diagonal of the array
- *         contain the N-by-N upper trapezoidal tile R;
- *         the elements below the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N tile A2.
- *         On exit, all the elements with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_ztsqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_ztsqrt;
-    void (*callback)(void*) = options->profiling ? cl_ztsqrt_callback : NULL;
-    CHAMELEON_starpu_ws_t *h_work = (CHAMELEON_starpu_ws_t*)(options->ws_host);
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_W(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &m,                 sizeof(int),
-        STARPU_VALUE,    &n,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_W,         RTBLKADDR(T,  CHAMELEON_Complex64_t, Tm,  Tn ),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-        /* max( nb * (ib+1), ib * (ib+nb) ) */
-        STARPU_SCRATCH,   options->ws_worker,
-        /* 2 * ib * (nb+ib) + nb */
-        STARPU_VALUE,    &h_work,            sizeof(CHAMELEON_starpu_ws_t *),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_USE_MPI)
-        STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n),
-#endif
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "ztsqrt",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_ztsqrt_cpu_func(void *descr[], void *cl_arg)
-{
-    CHAMELEON_starpu_ws_t *h_work;
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU, *WORK;
-
-    A1 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2 = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    T  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    TAU= (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb + ib*nb */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt, &h_work);
-
-    WORK = TAU + chameleon_max( m, n );
-    CORE_ztsqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(ztsqrt, 4, cl_ztsqrt_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zttlqt.c b/runtime/starpu/codelets/codelet_zttlqt.c
deleted file mode 100644
index a673832082e5aaa466b31b53230f0f6421c77333..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_zttlqt.c
+++ /dev/null
@@ -1,173 +0,0 @@
-/**
- *
- * @file starpu/codelet_zttlqt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttlqt StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttlqt computes a LQ factorization of a rectangular matrix
- *  formed by coupling side-by-side a complex M-by-M lower triangular tile A1
- *  and a complex M-by-N lower triangular tile A2:
- *
- *    | A1 A2 | = L * Q
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(k)' . . . H(2)' H(1)', where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; conjg(v(i+1:n)) is stored on exit in
- *  A2(i,1:n), and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A1 and A2. M >= 0.
- *         The number of columns of the tile A1.
- *
- * @param[in] N
- *         The number of columns of the tile A2. N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M-by-M tile A1.
- *         On exit, the elements on and below the diagonal of the array
- *         contain the M-by-M lower trapezoidal tile L;
- *         the elements above the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1.  LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N lower triangular tile A2.
- *         On exit, the elements on and below the diagonal of the array
- *         with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the array A2.  LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[in,out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_zttlqt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zttlqt;
-    void (*callback)(void*) = options->profiling ? cl_zttlqt_callback : NULL;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_W(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &m,                 sizeof(int),
-        STARPU_VALUE,    &n,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_W,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-         /* nb * (ib+1) */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zttlqt",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zttlqt_cpu_func(void *descr[], void *cl_arg)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    A1  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb * (ib+1) */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt);
-
-    WORK = TAU + chameleon_max( m, n );
-
-    CORE_zttlqt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zttlqt, 4, cl_zttlqt_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zttmlq.c b/runtime/starpu/codelets/codelet_zttmlq.c
deleted file mode 100644
index c2924eafcef47724732584b52ab235208c578a78..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_zttmlq.c
+++ /dev/null
@@ -1,212 +0,0 @@
-/**
- *
- * @file starpu/codelet_zttmlq.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmlq StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttmlq overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 (N1 == N2) with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |       | A1 | * Q
- *                             | A2 |       | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |       | A1 | * Q**H
- *                             | A2 |       | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_zttqrt.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTTQRT in the first k rows of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[out] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size LDWORK-by-N1.
- *
- * @param[in] LDWORK
- *         The dimension of the array WORK. LDWORK >= max(1,IB).
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_zttmlq(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zttmlq;
-    void (*callback)(void*) = options->profiling ? cl_zttmlq_callback : NULL;
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_R(V, Vm, Vn);
-    CHAMELEON_ACCESS_R(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &side,              sizeof(int),
-        STARPU_VALUE,    &trans,             sizeof(int),
-        STARPU_VALUE,    &m1,                sizeof(int),
-        STARPU_VALUE,    &n1,                sizeof(int),
-        STARPU_VALUE,    &m2,                sizeof(int),
-        STARPU_VALUE,    &n2,                sizeof(int),
-        STARPU_VALUE,    &k,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_R,         RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),
-        STARPU_VALUE,    &ldv,               sizeof(int),
-        STARPU_R,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-         /* nb * ib */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_VALUE,    &ldwork,            sizeof(int),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zttmlq",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zttmlq_cpu_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    A1   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
-    WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* nb * ib */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    CORE_zttmlq(side, trans, m1, n1, m2, n2, k, ib, A1, lda1,
-                A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zttmlq, 5, cl_zttmlq_cpu_func)
diff --git a/runtime/starpu/codelets/codelet_zttmqr.c b/runtime/starpu/codelets/codelet_zttmqr.c
deleted file mode 100644
index d485d16b9bcf582e1a86489612ad61b7724eb004..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_zttmqr.c
+++ /dev/null
@@ -1,269 +0,0 @@
-/**
- *
- * @file starpu/codelet_zttmqr.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttmqr StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttmqr overwrites the general complex M1-by-N1 tile A1 and
- *  M2-by-N2 tile A2 (N1 == N2) with
- *
- *                        SIDE = 'L'        SIDE = 'R'
- *    TRANS = 'N':         Q * | A1 |       | A1 | * Q
- *                             | A2 |       | A2 |
- *
- *    TRANS = 'C':      Q**H * | A1 |       | A1 | * Q**H
- *                             | A2 |       | A2 |
- *
- *  where Q is a complex unitary matrix defined as the product of k
- *  elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k)
- *
- *  as returned by CORE_zttqrt.
- *
- *******************************************************************************
- *
- * @param[in] side
- *         @arg ChamLeft  : apply Q or Q**H from the Left;
- *         @arg ChamRight : apply Q or Q**H from the Right.
- *
- * @param[in] trans
- *         @arg ChamNoTrans   :  No transpose, apply Q;
- *         @arg ChamConjTrans :  ConjTranspose, apply Q**H.
- *
- * @param[in] M1
- *         The number of rows of the tile A1. M1 >= 0.
- *
- * @param[in] N1
- *         The number of columns of the tile A1. N1 >= 0.
- *
- * @param[in] M2
- *         The number of rows of the tile A2. M2 >= 0.
- *         M2 = M1 if side == ChamRight.
- *
- * @param[in] N2
- *         The number of columns of the tile A2. N2 >= 0.
- *         N2 = N1 if side == ChamLeft.
- *
- * @param[in] K
- *         The number of elementary reflectors whose product defines
- *         the matrix Q.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the M1-by-N1 tile A1.
- *         On exit, A1 is overwritten by the application of Q.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1. LDA1 >= max(1,M1).
- *
- * @param[in,out] A2
- *         On entry, the M2-by-N2 tile A2.
- *         On exit, A2 is overwritten by the application of Q.
- *
- * @param[in] LDA2
- *         The leading dimension of the tile A2. LDA2 >= max(1,M2).
- *
- * @param[in] V
- *         The i-th row must contain the vector which defines the
- *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTTQRT in the first k columns of its array argument V.
- *
- * @param[in] LDV
- *         The leading dimension of the array V. LDV >= max(1,K).
- *
- * @param[in] T
- *         The IB-by-N1 triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] WORK
- *         Workspace array of size
- *             LDWORK-by-N1 if side == ChamLeft
- *             LDWORK-by-IB if side == ChamRight
- *
- * @param[in] LDWORK
- *         The leading dimension of the array WORK.
- *             LDWORK >= max(1,IB) if side == ChamLeft
- *             LDWORK >= max(1,M1) if side == ChamRight
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_zttmqr(const RUNTIME_option_t *options,
-                       cham_side_t side, cham_trans_t trans,
-                       int m1, int n1, int m2, int n2, int k, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *V, int Vm, int Vn, int ldv,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zttmqr;
-    void (*callback)(void*) = options->profiling ? cl_zttmqr_callback : NULL;
-    int ldwork = side == ChamLeft ? ib : nb;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_R(V, Vm, Vn);
-    CHAMELEON_ACCESS_R(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &side,              sizeof(int),
-        STARPU_VALUE,    &trans,             sizeof(int),
-        STARPU_VALUE,    &m1,                sizeof(int),
-        STARPU_VALUE,    &n1,                sizeof(int),
-        STARPU_VALUE,    &m2,                sizeof(int),
-        STARPU_VALUE,    &n2,                sizeof(int),
-        STARPU_VALUE,    &k,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_R,         RTBLKADDR(V, CHAMELEON_Complex64_t, Vm, Vn),
-        STARPU_VALUE,    &ldv,               sizeof(int),
-        STARPU_R,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-        /* max( ib*nb, 2*ib*nb ) */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_VALUE,    &ldwork,            sizeof(int),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_USE_MPI)
-        STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n),
-#endif
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zttmqr",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zttmqr_cpu_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *V;
-    int ldv;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *WORK;
-    int ldwork;
-
-    A1   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
-    WORK = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    CORE_zttmqr(side, trans, m1, n1, m2, n2, k, ib,
-                A1, lda1, A2, lda2, V, ldv, T, ldt, WORK, ldwork);
-}
-
-#if defined(CHAMELEON_USE_CUDA)
-static void cl_zttmqr_cuda_func(void *descr[], void *cl_arg)
-{
-    cham_side_t side;
-    cham_trans_t trans;
-    int m1;
-    int n1;
-    int m2;
-    int n2;
-    int k;
-    int ib;
-    cuDoubleComplex *A1;
-    int lda1;
-    cuDoubleComplex *A2;
-    int lda2;
-    cuDoubleComplex *V;
-    int ldv;
-    cuDoubleComplex *T;
-    int ldt;
-    cuDoubleComplex *W, *WC;
-    int ldwork;
-    int ldworkc;
-
-    A1 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2 = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
-    V  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
-    T  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
-    W  = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
-
-    starpu_codelet_unpack_args(cl_arg, &side, &trans, &m1, &n1, &m2, &n2, &k, &ib,
-                               &lda1, &lda2, &ldv, &ldt, &ldwork);
-
-    WC = W + ib * (side == ChamLeft ? m1 : n1);
-    ldworkc = (side == ChamLeft) ? m2 : ib;
-
-    RUNTIME_getStream(stream);
-
-    CUDA_zttmqr(
-            side, trans, m1, n1, m2, n2, k, ib,
-            A1, lda1, A2, lda2, V, ldv, T, ldt,
-            W, ldwork, WC, ldworkc, stream );
-
-#ifndef STARPU_CUDA_ASYNC
-    cudaStreamSynchronize( stream );
-#endif
-}
-#endif /* defined(CHAMELEON_USE_CUDA) */
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS(zttmqr, 5, cl_zttmqr_cpu_func, cl_zttmqr_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_zttqrt.c b/runtime/starpu/codelets/codelet_zttqrt.c
deleted file mode 100644
index 39d52185fdc68654e966b34db700b31733cde5c6..0000000000000000000000000000000000000000
--- a/runtime/starpu/codelets/codelet_zttqrt.c
+++ /dev/null
@@ -1,176 +0,0 @@
-/**
- *
- * @file starpu/codelet_zttqrt.c
- *
- * @copyright 2009-2014 The University of Tennessee and The University of
- *                      Tennessee Research Foundation. All rights reserved.
- * @copyright 2012-2018 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
- *                      Univ. Bordeaux. All rights reserved.
- *
- ***
- *
- * @brief Chameleon zttqrt StarPU codelet
- *
- * @version 1.0.0
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 1.0.0
- * @author Hatem Ltaief
- * @author Dulceneia Becker
- * @author Mathieu Faverge
- * @author Emmanuel Agullo
- * @author Cedric Castagnede
- * @date 2010-11-15
- * @precisions normal z -> c d s
- *
- */
-#include "chameleon_starpu.h"
-#include "runtime_codelet_z.h"
-
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- *  CORE_zttqrt computes a QR factorization of a rectangular matrix
- *  formed by coupling a complex N-by-N upper triangular tile A1
- *  on top of a complex M-by-N upper trapezoidal tile A2:
- *
- *    | A1 | = Q * R
- *    | A2 |
- *
- *  The tile Q is represented as a product of elementary reflectors
- *
- *    Q = H(1) H(2) . . . H(k), where k = min(M,N).
- *
- *  Each H(i) has the form
- *
- *    H(i) = I - tau * v * v'
- *
- *  where tau is a complex scalar, and v is a complex vector with
- *  v(1:i-1) = 0 and v(i) = 1; v(i+1:m) is stored on exit in A2(1:m,i),
- *  and tau in TAU(i).
- *
- *******************************************************************************
- *
- * @param[in] M
- *         The number of rows of the tile A2.  M >= 0.
- *
- * @param[in] N
- *         The number of columns of the tile A1 and A2.  N >= 0.
- *
- * @param[in] IB
- *         The inner-blocking size.  IB >= 0.
- *
- * @param[in,out] A1
- *         On entry, the N-by-N tile A1.
- *         On exit, the elements on and above the diagonal of the array
- *         contain the N-by-N upper trapezoidal tile R;
- *         the elements below the diagonal are not referenced.
- *
- * @param[in] LDA1
- *         The leading dimension of the array A1.  LDA1 >= max(1,N).
- *
- * @param[in,out] A2
- *         On entry, the M-by-N upper triangular tile A2.
- *         On exit, the elements on and above the diagonal of the array
- *         with the array TAU, represent
- *         the unitary tile Q as a product of elementary reflectors
- *         (see Further Details).
- *
- * @param[in] LDA2
- *         The leading dimension of the array A2.  LDA2 >= max(1,M).
- *
- * @param[out] T
- *         The IB-by-N triangular factor T of the block reflector.
- *         T is upper triangular by block (economic storage);
- *         The rest of the array is not referenced.
- *
- * @param[in] LDT
- *         The leading dimension of the array T. LDT >= IB.
- *
- * @param[out] TAU
- *         The scalar factors of the elementary reflectors (see Further
- *         Details).
- *
- * @param[in,out] WORK
- *
- *******************************************************************************
- *
- * @return
- *          \retval CHAMELEON_SUCCESS successful exit
- *          \retval <0 if -i, the i-th argument had an illegal value
- *
- */
-
-void INSERT_TASK_zttqrt(const RUNTIME_option_t *options,
-                       int m, int n, int ib, int nb,
-                       const CHAM_desc_t *A1, int A1m, int A1n, int lda1,
-                       const CHAM_desc_t *A2, int A2m, int A2n, int lda2,
-                       const CHAM_desc_t *T, int Tm, int Tn, int ldt)
-{
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zttqrt;
-    void (*callback)(void*) = options->profiling ? cl_zttqrt_callback : NULL;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A1, A1m, A1n);
-    CHAMELEON_ACCESS_RW(A2, A2m, A2n);
-    CHAMELEON_ACCESS_W(T, Tm, Tn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &m,                 sizeof(int),
-        STARPU_VALUE,    &n,                 sizeof(int),
-        STARPU_VALUE,    &ib,                sizeof(int),
-        STARPU_RW,        RTBLKADDR(A1, CHAMELEON_Complex64_t, A1m, A1n),
-        STARPU_VALUE,    &lda1,              sizeof(int),
-        STARPU_RW,        RTBLKADDR(A2, CHAMELEON_Complex64_t, A2m, A2n),
-        STARPU_VALUE,    &lda2,              sizeof(int),
-        STARPU_W,         RTBLKADDR(T, CHAMELEON_Complex64_t, Tm, Tn),
-        STARPU_VALUE,    &ldt,               sizeof(int),
-         /* nb * (ib+1) */
-        STARPU_SCRATCH,   options->ws_worker,
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-#if defined(CHAMELEON_USE_MPI)
-        STARPU_EXECUTE_ON_NODE, A2->get_rankof(A2, A2m, A2n),
-#endif
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zttqrt",
-#endif
-        0);
-}
-
-
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zttqrt_cpu_func(void *descr[], void *cl_arg)
-{
-    int m;
-    int n;
-    int ib;
-    CHAMELEON_Complex64_t *A1;
-    int lda1;
-    CHAMELEON_Complex64_t *A2;
-    int lda2;
-    CHAMELEON_Complex64_t *T;
-    int ldt;
-    CHAMELEON_Complex64_t *TAU;
-    CHAMELEON_Complex64_t *WORK;
-
-    A1  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    A2  = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
-    T   = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
-    TAU = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* nb * (ib+1) */
-
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &lda1, &lda2, &ldt);
-
-    WORK = TAU + chameleon_max( m, n );
-
-    CORE_zttqrt(m, n, ib, A1, lda1, A2, lda2, T, ldt, TAU, WORK);
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zttqrt, 4, cl_zttqrt_cpu_func)
diff --git a/runtime/starpu/control/runtime_zlocality.c b/runtime/starpu/control/runtime_zlocality.c
index 0b2a7bc282dd6d39e852cb6d2679367d8b213f31..025e300c62775d79287593f316fb5671dea40345 100644
--- a/runtime/starpu/control/runtime_zlocality.c
+++ b/runtime/starpu/control/runtime_zlocality.c
@@ -15,7 +15,7 @@
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
- * @date 2011-06-01
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -58,23 +58,15 @@ void RUNTIME_zlocality_allrestrict( uint32_t where )
 
     /* QR */
     cl_zgeqrt_restrict_where( where );
-    cl_ztsqrt_restrict_where( where );
+    cl_ztpqrt_restrict_where( where );
     cl_zunmqr_restrict_where( where );
-    cl_ztsmqr_restrict_where( where );
-
-    /* QR-RH */
-/*     cl_zttqrt_restrict_where( where ); */
-/*     cl_zttmqr_restrict_where( where ); */
+    cl_ztpmqrt_restrict_where( where );
 
     /* LQ */
    cl_zgelqt_restrict_where( where );
-   cl_ztslqt_restrict_where( where );
+   cl_ztplqt_restrict_where( where );
    cl_zunmlq_restrict_where( where );
-   cl_ztsmlq_restrict_where( where );
-
-    /* LQ-RH */
-/*     cl_zttlqt_restrict_where( where ); */
-/*     cl_zttmlq_restrict_where( where ); */
+   cl_ztpmlqt_restrict_where( where );
 
 }
 
@@ -112,23 +104,15 @@ void RUNTIME_zlocality_onerestrict( cham_tasktype_t kernel, uint32_t where )
 
     /* QR */
     case TASK_GEQRT:  cl_zgeqrt_restrict_where( where ); break;
+    case TASK_TPQRT:  cl_ztpqrt_restrict_where( where ); break;
     case TASK_UNMQR:  cl_zunmqr_restrict_where( where ); break;
-    case TASK_TSMQR:  cl_ztsmqr_restrict_where( where ); break;
-    case TASK_TSQRT:  cl_ztsqrt_restrict_where( where ); break;
-
-    /* QR-RH */
-/*     case TASK_TTMQR:  cl_zttmqr_restrict_where( where ); break; */
-/*     case TASK_TTQRT:  cl_zttqrt_restrict_where( where ); break; */
+    case TASK_TPMQRT: cl_ztpmqrt_restrict_where( where ); break;
 
     /* LQ */
    case TASK_GELQT:  cl_zgelqt_restrict_where( where ); break;
+   case TASK_TPLQT:  cl_ztplqt_restrict_where( where ); break;
    case TASK_UNMLQ:  cl_zunmlq_restrict_where( where ); break;
-   case TASK_TSMLQ:  cl_ztsmlq_restrict_where( where ); break;
-   case TASK_TSLQT:  cl_ztslqt_restrict_where( where ); break;
-
-    /* LQ-RH */
-/*     case TASK_TTMLQ:  cl_zttmlq_restrict_where( where ); break; */
-/*     case TASK_TTLQT:  cl_zttlqt_restrict_where( where ); break; */
+   case TASK_TPMLQT: cl_ztpmlqt_restrict_where( where ); break;
 
     default:
       return;
@@ -167,23 +151,15 @@ void RUNTIME_zlocality_allrestore( )
 
     /* QR */
     cl_zgeqrt_restore_where();
-    cl_ztsqrt_restore_where();
+    cl_ztpqrt_restore_where();
     cl_zunmqr_restore_where();
-    cl_ztsmqr_restore_where();
-
-    /* QR-RH */
-/*     cl_zttqrt_restore_where(); */
-/*     cl_zttmqr_restore_where(); */
+    cl_ztpmqrt_restore_where();
 
     /* LQ */
    cl_zgelqt_restore_where();
-   cl_ztslqt_restore_where();
+   cl_ztplqt_restore_where();
    cl_zunmlq_restore_where();
-   cl_ztsmlq_restore_where();
-
-    /* LQ-RH */
-/*     cl_zttlqt_restore_where(); */
-/*     cl_zttmlq_restore_where(); */
+   cl_ztpmlqt_restore_where();
 
 }
 
@@ -221,23 +197,15 @@ void RUNTIME_zlocality_onerestore( cham_tasktype_t kernel )
 
     /* QR */
     case TASK_GEQRT:  cl_zgeqrt_restore_where(); break;
+    case TASK_TPQRT:  cl_ztpqrt_restore_where(); break;
     case TASK_UNMQR:  cl_zunmqr_restore_where(); break;
-    case TASK_TSMQR:  cl_ztsmqr_restore_where(); break;
-    case TASK_TSQRT:  cl_ztsqrt_restore_where(); break;
-
-    /* QR-RH */
-/*     case TASK_TTMQR:  cl_zttmqr_restore_where(); break; */
-/*     case TASK_TTQRT:  cl_zttqrt_restore_where(); break; */
+    case TASK_TPMQRT: cl_ztpmqrt_restore_where(); break;
 
     /* LQ */
    case TASK_GELQT:  cl_zgelqt_restore_where(); break;
+   case TASK_TPLQT:  cl_ztplqt_restore_where(); break;
    case TASK_UNMLQ:  cl_zunmlq_restore_where(); break;
-   case TASK_TSMLQ:  cl_ztsmlq_restore_where(); break;
-   case TASK_TSLQT:  cl_ztslqt_restore_where(); break;
-
-    /* LQ-RH */
-/*     case TASK_TTMLQ:  cl_zttmlq_restore_where(); break; */
-/*     case TASK_TTLQT:  cl_zttlqt_restore_where(); break; */
+   case TASK_TPMLQT: cl_ztpmlqt_restore_where(); break;
 
     default:
       return;
diff --git a/runtime/starpu/control/runtime_zprofiling.c b/runtime/starpu/control/runtime_zprofiling.c
index 82af2b32836b8e3d6400f17cadd86b8f4f02de00..848746b7f5e4a9c64f060d06393983382907e0f1 100644
--- a/runtime/starpu/control/runtime_zprofiling.c
+++ b/runtime/starpu/control/runtime_zprofiling.c
@@ -15,7 +15,7 @@
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
- * @date 2011-06-01
+ * @date 2018-11-08
  * @precisions normal z -> s d c
  *
  */
@@ -43,21 +43,18 @@ void RUNTIME_zdisplay_allprofile()
     profiling_display_zgelqt_info();
     profiling_display_zgeqrt_info();
     profiling_display_zgessm_info();
-    profiling_display_zgetrf_info();
     profiling_display_zgetrf_incpiv_info();
+    profiling_display_zgetrf_info();
     profiling_display_zgetrf_nopiv_info();
     profiling_display_zlauum_info();
     profiling_display_zpotrf_info();
     profiling_display_zssssm_info();
+    profiling_display_ztplqt_info();
+    profiling_display_ztpmlqt_info();
+    profiling_display_ztpmqrt_info();
+    profiling_display_ztpqrt_info();
     profiling_display_ztrtri_info();
-    profiling_display_ztslqt_info();
-    profiling_display_ztsmqr_info();
-    profiling_display_ztsqrt_info();
     profiling_display_ztstrf_info();
-    profiling_display_zttlqt_info();
-    profiling_display_zttmlq_info();
-    profiling_display_zttmqr_info();
-    profiling_display_zttqrt_info();
     profiling_display_zunmlq_info();
     profiling_display_zunmqr_info();
 
@@ -78,7 +75,7 @@ void RUNTIME_zdisplay_oneprofile( cham_tasktype_t kernel )
     case TASK_SYMM:         profiling_display_zsymm_info();         break;
     case TASK_SYR2K:        profiling_display_zsyr2k_info();        break;
     case TASK_SYRK:         profiling_display_zsyrk_info();         break;
-    case TASK_TRMM:         profiling_display_ztrmm_info();         break; 
+    case TASK_TRMM:         profiling_display_ztrmm_info();         break;
     case TASK_TRSM:         profiling_display_ztrsm_info();         break;
 
         /* Lapack */
@@ -92,14 +89,13 @@ void RUNTIME_zdisplay_oneprofile( cham_tasktype_t kernel )
     case TASK_POTRF:        profiling_display_zpotrf_info();        break;
     case TASK_SSSSM:        profiling_display_zssssm_info();        break;
     case TASK_TRTRI:        profiling_display_ztrtri_info();        break;
-    case TASK_TSLQT:        profiling_display_ztslqt_info();        break;
-    case TASK_TSMQR:        profiling_display_ztsmqr_info();        break;
-    case TASK_TSQRT:        profiling_display_ztsqrt_info();        break;
     case TASK_TSTRF:        profiling_display_ztstrf_info();        break;
-    case TASK_TTLQT:        profiling_display_zttlqt_info();        break;
-    case TASK_TTMLQ:        profiling_display_zttmlq_info();        break;
-    case TASK_TTMQR:        profiling_display_zttmqr_info();        break;
-    case TASK_TTQRT:        profiling_display_zttqrt_info();        break;
+
+    case TASK_TPLQT:        profiling_display_ztplqt_info();        break;
+    case TASK_TPMLQT:       profiling_display_ztpmlqt_info();        break;
+    case TASK_TPMQRT:       profiling_display_ztpmqrt_info();        break;
+    case TASK_TPQRT:        profiling_display_ztpqrt_info();        break;
+
     case TASK_UNMLQ:        profiling_display_zunmlq_info();        break;
     case TASK_UNMQR:        profiling_display_zunmqr_info();        break;
 
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 30b5f282db608afbb653daadf4b2c5159d2142be..509abacfc9a3dd9c2fd09729f8a7e7a351778476 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -15,7 +15,7 @@
  * @author Cedric Augonnet
  * @author Mathieu Faverge
  * @author Cedric Castagnede
- * @date 2011-06-01
+ * @date 2018-11-08
  * @precisions normal z -> c d s
  *
  */
@@ -78,17 +78,9 @@ ZCODELETS_HEADER(tplqt)
 ZCODELETS_HEADER(tpqrt)
 ZCODELETS_HEADER(tpmlqt)
 ZCODELETS_HEADER(tpmqrt)
-ZCODELETS_HEADER(tslqt)
-ZCODELETS_HEADER(tsmlq)
-ZCODELETS_HEADER(tsmqr)
 ZCODELETS_HEADER(tsmlq_hetra1)
 ZCODELETS_HEADER(tsmqr_hetra1)
-ZCODELETS_HEADER(tsqrt)
 ZCODELETS_HEADER(tstrf)
-ZCODELETS_HEADER(ttlqt)
-ZCODELETS_HEADER(ttmlq)
-ZCODELETS_HEADER(ttmqr)
-ZCODELETS_HEADER(ttqrt)
 ZCODELETS_HEADER(unmlq)
 ZCODELETS_HEADER(unmqr)