From 9c2e2baf7419b9c623743d4415080c45bfbb2d35 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 15 Dec 2016 22:34:35 +0100
Subject: [PATCH 1/8] Add tpqrt and tpmqrt kernels

---
 coreblas/compute/core_ztpmqrt.c | 190 ++++++++++++++++++++++++++++++++
 coreblas/compute/core_ztpqrt.c  | 159 ++++++++++++++++++++++++++
 coreblas/include/coreblas_z.h   |  12 ++
 3 files changed, 361 insertions(+)
 create mode 100644 coreblas/compute/core_ztpmqrt.c
 create mode 100644 coreblas/compute/core_ztpqrt.c

diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c
new file mode 100644
index 000000000..571da630a
--- /dev/null
+++ b/coreblas/compute/core_ztpmqrt.c
@@ -0,0 +1,190 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_ztpmqrt.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "coreblas/include/coreblas.h"
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_MORSE_Complex64_t
+ *
+ * CORE_ztpmqrt applies a complex orthogonal matrix Q obtained from a
+ * "triangular-pentagonal" complex block reflector H to a general complex matrix
+ * C, which consists of two blocks A and B.
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *         @arg MorseLeft  : apply Q or Q**H from the Left;
+ *         @arg MorseRight : apply Q or Q**H from the Right.
+ *
+ * @param[in] trans
+ *         @arg MorseNoTrans   :  No transpose, apply Q;
+ *         @arg MorseConjTrans :  ConjTranspose, apply Q**H.
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B. N >= 0.
+ *
+ * @param[in] K
+ *         The number of elementary reflectors whose product defines
+ *         the matrix Q.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ *          K >= L >= 0.  See Further Details.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] V
+ *         The i-th column must contain the vector which defines the
+ *         elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *         CORE_ztpqrt in its array argument B.
+ *
+ * @param[in] LDV
+ *         The leading dimension of V. LDV >= max(1,M) for MorseLeft, max(1,N) for MorseRight.
+ *
+ * @param[in] T
+ *         The IB-by-K triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[in,out] A
+ *         A is COMPLEX*16 array, dimension (LDA,N) if side = MorseLeft
+ *         or (LDA,K) if SIDE = MorseRight
+ *         On entry, the K-by-N or M-by-K matrix A.
+ *         On exit, A is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A.
+ *         If side = MorseLeft,  LDA >= max(1,K);
+ *         If side = MorseRight, LDA >= max(1,M).
+ *
+ * @param[in,out] B
+ *         On entry, the M-by-N tile B.
+ *         On exit, B is overwritten by the corresponding block of
+ *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
+ *
+ * @param[in] LDB
+ *         The leading dimension of the tile B. LDB >= max(1,M).
+ *
+ * @param[out] WORK
+ *         Workspace array of size LDWORK-by-NB.
+ *         LDWORK = IB if side = MorseLeft, or M if side = MorseRight.
+ *
+ *******************************************************************************
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The columns of the pentagonal matrix V contain the elementary reflectors
+ *  H(1), H(2), ..., H(K); V is composed of a rectangular block V1 and a
+ *  trapezoidal block V2:
+ *
+ *        V = [V1]
+ *            [V2].
+ *
+ *  The size of the trapezoidal block V2 is determined by the parameter L,
+ *  where 0 <= L <= K; V2 is upper trapezoidal, consisting of the first L
+ *  rows of a K-by-K upper triangular matrix.  If L=K, V2 is upper triangular;
+ *  if L=0, there is no trapezoidal block, hence V = V1 is rectangular.
+ *
+ *  If side = MorseLeft:  C = [A]  where A is K-by-N,  B is M-by-N and V is M-by-K.
+ *                            [B]
+ *
+ *  If side = MorseRight: C = [A B]  where A is M-by-K, B is M-by-N and V is N-by-K.
+ *
+ *  The complex orthogonal matrix Q is formed from V and T.
+ *
+ *  If trans='N' and side='L', C is on exit replaced with Q * C.
+ *
+ *  If trans='C' and side='L', C is on exit replaced with Q**H * C.
+ *
+ *  If trans='N' and side='R', C is on exit replaced with C * Q.
+ *
+ *  If trans='C' and side='R', C is on exit replaced with C * Q**H.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+
+int CORE_ztpmqrt( MORSE_enum side, MORSE_enum trans,
+                  int M, int N, int K, int L, int IB,
+                  const MORSE_Complex64_t *V, int LDV,
+                  const MORSE_Complex64_t *T, int LDT,
+                  MORSE_Complex64_t *A, int LDA,
+                  MORSE_Complex64_t *B, int LDB,
+                  MORSE_Complex64_t *WORK )
+{
+    int m1;
+    int n1;
+
+    /* Check input arguments */
+    if ((side != MorseLeft) && (side != MorseRight)) {
+        coreblas_error(1, "Illegal value of side");
+        return -1;
+    }
+
+    if ( side == MorseLeft ) {
+        m1 = K;
+        n1 = N;
+        ldwork = ib;
+    }
+    else {
+        m1 = M;
+        n1 = K;
+        ldwork = m1;
+    }
+
+    /* TS case */
+    if (L == 0) {
+        CORE_ztsmqr( side, trans, m1, n1, M, N, K, IB,
+                     A, LDA, B, LDB, V, LDV, T, LDT,
+                     WORK, ldwork );
+    }
+    /* TT case */
+    else  if( L == M ) {
+        CORE_zttmqr( side, trans, m1, n1, M, N, K, IB,
+                     A, LDA, B, LDB, V, LDV, T, LDT,
+                     WORK, ldwork );
+    }
+    else {
+        //LAPACKE_ztpmqrt_work( LAPACK_COL_MAJOR, M, N, K, L, IB, V, LDV, T, LDT, A, LDA, B, LDB, WORK );
+        coreblas_error( 3, "Illegal value of L (only 0 or M handled for now)");
+        return -3;
+    }
+
+    return MORSE_SUCCESS;
+}
diff --git a/coreblas/compute/core_ztpqrt.c b/coreblas/compute/core_ztpqrt.c
new file mode 100644
index 000000000..76a94fc2c
--- /dev/null
+++ b/coreblas/compute/core_ztpqrt.c
@@ -0,0 +1,159 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file core_ztpqrt.c
+ *
+ *  PLASMA core_blas kernel
+ *  PLASMA is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> c d s
+ *
+ **/
+#include "coreblas/include/lapacke.h"
+#include "coreblas/include/coreblas.h"
+
+/**
+ ******************************************************************************
+ *
+ * @ingroup CORE_MORSE_Complex64_t
+ *
+ * CORE_ztpqrt computes a blocked QR factorization of a complex
+ * "triangular-pentagonal" matrix C, which is composed of a
+ * triangular block A and pentagonal block B, using the compact
+ * WY representation for Q.
+ *
+ *  C = | A | = Q * R
+ *      | B |
+ *
+ *******************************************************************************
+ *
+ * @param[in] M
+ *         The number of rows of the tile B. M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the tile B, and the order of the
+ *         tile A. N >= 0.
+ *
+ * @param[in] IB
+ *         The inner-blocking size.  IB >= 0.
+ *
+ * @param[in] N
+ *          The number of columns of the matrix B, and the order of the matrix
+ *          A. N >= 0.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of B.
+ *          MIN(M,N) >= L >= 0.  See Further Details.
+ *
+ * @param[in,out] A
+ *          On entry, the upper triangular N-by-N matrix A.
+ *          On exit, the elements on and above the diagonal of the array
+ *          contain the upper triangular matrix R.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ * @param[in,out] B
+ *          On entry, the pentagonal M-by-N matrix B.  The first M-L rows
+ *          are rectangular, and the last L rows are upper trapezoidal.
+ *          On exit, B contains the pentagonal matrix V.  See Further Details.
+ *
+ * @param[in] LDB
+ *          The leading dimension of the array B.  LDB >= max(1,M).
+ *
+ * @param[out] T
+ *         The IB-by-N triangular factor T of the block reflector.
+ *         T is upper triangular by block (economic storage);
+ *         The rest of the array is not referenced.
+ *
+ * @param[in] LDT
+ *         The leading dimension of the array T. LDT >= IB.
+ *
+ * @param[out] TAU
+ *         The scalar factors of the elementary reflectors (see Further
+ *         Details).
+ *
+ * @param[out] WORK
+ *          WORK is COMPLEX*16 array, dimension ((IB+1)*N)
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ ******************************************************************************/
+int CORE_ztpqrt( int M, int N, int L, int IB,
+                 MORSE_Complex64_t *A, int LDA,
+                 MORSE_Complex64_t *B, int LDB,
+                 MORSE_Complex64_t *T, int LDT,
+                 MORSE_Complex64_t *WORK )
+{
+    static MORSE_Complex64_t zone  = 1.0;
+    static MORSE_Complex64_t zzero = 0.0;
+
+    MORSE_Complex64_t alpha;
+    int i, ii, sb;
+
+#if !defined(NDEBUG)
+     /* Check input arguments */
+    if (M < 0) {
+        coreblas_error(1, "Illegal value of M");
+        return -1;
+    }
+    if (N < 0) {
+        coreblas_error(2, "Illegal value of N");
+        return -2;
+    }
+    if( (L < 0) || ((L > min(M, N)) && (min(M,N) > 0))) {
+        coreblas_error(3, "Illegal value of L");
+        return -3;
+    }
+    if (IB < 0) {
+        coreblas_error(4, "Illegal value of IB");
+        return -4;
+    }
+    if ((LDA < max(1,N)) && (N > 0)) {
+        coreblas_error(6, "Illegal value of LDA");
+        return -6;
+    }
+    if ((LDB < max(1,M)) && (M > 0)) {
+        coreblas_error(6, "Illegal value of LDB");
+        return -8;
+    }
+    if ((LDT < max(1,IB)) && (IB > 0)) {
+        coreblas_error(6, "Illegal value of LDT");
+        return -10;
+    }
+#endif /*!defined(NDEBUG)*/
+
+    /* Quick return */
+    if ((M == 0) || (N == 0) || (IB == 0))
+        return MORSE_SUCCESS;
+
+    if ( L == O ) {
+        CORE_ztsqrt( M, N, IB, A, LDA, B, LDB, T, LDT, WORK, WORK+N );
+    }
+    else if (L == M) {
+        CORE_zttqrt( M, N, IB, A, LDA, B, LDB, T, LDT, WORK, WORK+N );
+    }
+    else {
+        //LAPACKE_ztpqrt_work( LAPACK_COL_MAJOR, M, N, L, IB, A, LDA, B, LDB, T, LDT, WORK );
+        coreblas_error( 3, "Illegal value of L (only 0 or M handled for now)");
+        return -3;
+    }
+    return MORSE_SUCCESS;
+}
diff --git a/coreblas/include/coreblas_z.h b/coreblas/include/coreblas_z.h
index e94455562..345836f81 100644
--- a/coreblas/include/coreblas_z.h
+++ b/coreblas/include/coreblas_z.h
@@ -352,6 +352,18 @@ int  CORE_ztstrf(int M, int N, int IB, int NB,
                  MORSE_Complex64_t *L, int LDL,
                  int *IPIV, MORSE_Complex64_t *WORK,
                  int LDWORK, int *INFO);
+int CORE_ztpqrt( int M, int N, int L, int IB,
+                 MORSE_Complex64_t *A, int LDA,
+                 MORSE_Complex64_t *B, int LDB,
+                 MORSE_Complex64_t *T, int LDT,
+                 MORSE_Complex64_t *WORK );
+int CORE_ztpmqrt( MORSE_enum side, MORSE_enum trans,
+                  int M, int N, int K, int L, int IB,
+                  const MORSE_Complex64_t *V, int LDV,
+                  const MORSE_Complex64_t *T, int LDT,
+                  MORSE_Complex64_t *A, int LDA,
+                  MORSE_Complex64_t *B, int LDB,
+                  MORSE_Complex64_t *WORK );
 int  CORE_zttmqr(MORSE_enum side, MORSE_enum trans,
                  int M1, int N1, int M2, int N2, int K, int IB,
                  MORSE_Complex64_t *A1, int LDA1,
-- 
GitLab


From 3b69d027e31ec84bc0af7f341934b19710c166bc Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 15 Dec 2016 22:35:22 +0100
Subject: [PATCH 2/8] Add task interfaces to tpqrt/tpmqrt tasks

---
 include/morse_z.h   |  3 +++
 include/runtime_z.h | 12 ++++++++++++
 2 files changed, 15 insertions(+)

diff --git a/include/morse_z.h b/include/morse_z.h
index f01b0c778..07c81112a 100644
--- a/include/morse_z.h
+++ b/include/morse_z.h
@@ -102,6 +102,7 @@ int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, MORSE_Complex64
 int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC);
 int MORSE_zsysv(MORSE_enum uplo, int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
 int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
+int MORSE_ztpqrt( int M, int N, int L, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, MORSE_desc_t *descT );
 int MORSE_ztradd(MORSE_enum uplo, MORSE_enum trans, int M, int N, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t beta, MORSE_Complex64_t *B, int LDB);
 int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, int N, int NRHS, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
 int MORSE_ztrsm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, int N, int NRHS, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
@@ -179,6 +180,7 @@ int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha,
 int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C);
 int MORSE_zsysv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
 int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+int MORSE_ztpqrt_Tile( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T );
 int MORSE_ztradd_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B);
 int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B);
 int MORSE_ztrsm_Tile(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B);
@@ -253,6 +255,7 @@ int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, M
 int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
+int MORSE_ztpqrt_Tile_Async( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request );
 int MORSE_ztradd_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_ztrsm_Tile_Async(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
diff --git a/include/runtime_z.h b/include/runtime_z.h
index df6b2f553..d4e434a8c 100644
--- a/include/runtime_z.h
+++ b/include/runtime_z.h
@@ -315,6 +315,18 @@ void MORSE_TASK_zswptr_ontile(const MORSE_option_t *options,
                               const MORSE_desc_t descA, const MORSE_desc_t *Aij, int Aijm, int Aijn,
                               int i1,  int i2, int *ipiv, int inc,
                               const MORSE_desc_t *Akk, int Akkm, int Akkn, int ldak);
+void MORSE_TASK_ztpmqrt(const MORSE_option_t *options,
+                        MORSE_enum side, MORSE_enum trans,
+                        int m, int n, int k, int l, int ib, int nb,
+                        const MORSE_desc_t *V, int Vm, int Vn, int ldv,
+                        const MORSE_desc_t *T, int Tm, int Tn, int ldt,
+                        const MORSE_desc_t *A, int Am, int An, int lda,
+                        const MORSE_desc_t *B, int Bm, int Bn, int ldb );
+void MORSE_TASK_ztpqrt(const MORSE_option_t *options,
+                       int m, int n, int l, int ib, int nb,
+                       const MORSE_desc_t *A, int Am, int An, int lda,
+                       const MORSE_desc_t *B, int Bm, int Bn, int ldb,
+                       const MORSE_desc_t *T, int Tm, int Tn, int ldt );
 void MORSE_TASK_ztrdalg(const MORSE_option_t *options,
                         MORSE_enum uplo,
                         int N, int NB,
-- 
GitLab


From 9224a46894beec01ed0cd13e812a315691eb4424 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 15 Dec 2016 22:47:55 +0100
Subject: [PATCH 3/8] Add compilation of the kernels

---
 coreblas/compute/CMakeLists.txt | 2 ++
 coreblas/compute/core_ztpmqrt.c | 5 ++---
 coreblas/compute/core_ztpqrt.c  | 8 +-------
 3 files changed, 5 insertions(+), 10 deletions(-)

diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index 82813231b..d24bc295a 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -73,6 +73,8 @@ set(ZSRC
     core_zsyrk.c
     core_zsyssq.c
     core_zsytf2_nopiv.c
+    core_ztpqrt.c
+    core_ztpmqrt.c
     core_ztradd.c
     core_ztrasm.c
     core_ztrmm.c
diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c
index 571da630a..ee44b8cf8 100644
--- a/coreblas/compute/core_ztpmqrt.c
+++ b/coreblas/compute/core_ztpmqrt.c
@@ -148,8 +148,7 @@ int CORE_ztpmqrt( MORSE_enum side, MORSE_enum trans,
                   MORSE_Complex64_t *B, int LDB,
                   MORSE_Complex64_t *WORK )
 {
-    int m1;
-    int n1;
+    int m1, n1, ldwork;
 
     /* Check input arguments */
     if ((side != MorseLeft) && (side != MorseRight)) {
@@ -160,7 +159,7 @@ int CORE_ztpmqrt( MORSE_enum side, MORSE_enum trans,
     if ( side == MorseLeft ) {
         m1 = K;
         n1 = N;
-        ldwork = ib;
+        ldwork = IB;
     }
     else {
         m1 = M;
diff --git a/coreblas/compute/core_ztpqrt.c b/coreblas/compute/core_ztpqrt.c
index 76a94fc2c..504951a0c 100644
--- a/coreblas/compute/core_ztpqrt.c
+++ b/coreblas/compute/core_ztpqrt.c
@@ -102,12 +102,6 @@ int CORE_ztpqrt( int M, int N, int L, int IB,
                  MORSE_Complex64_t *T, int LDT,
                  MORSE_Complex64_t *WORK )
 {
-    static MORSE_Complex64_t zone  = 1.0;
-    static MORSE_Complex64_t zzero = 0.0;
-
-    MORSE_Complex64_t alpha;
-    int i, ii, sb;
-
 #if !defined(NDEBUG)
      /* Check input arguments */
     if (M < 0) {
@@ -144,7 +138,7 @@ int CORE_ztpqrt( int M, int N, int L, int IB,
     if ((M == 0) || (N == 0) || (IB == 0))
         return MORSE_SUCCESS;
 
-    if ( L == O ) {
+    if ( L == 0 ) {
         CORE_ztsqrt( M, N, IB, A, LDA, B, LDB, T, LDT, WORK, WORK+N );
     }
     else if (L == M) {
-- 
GitLab


From eb02ab84224ada97e65fc71cd43b3d3727473021 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Thu, 15 Dec 2016 22:52:10 +0100
Subject: [PATCH 4/8] Add now driver and parallel implementation

---
 compute/CMakeLists.txt |   2 +
 compute/pztpqrt.c      | 151 +++++++++++++++++
 compute/ztpqrt.c       | 361 +++++++++++++++++++++++++++++++++++++++++
 control/compute_z.h    |   1 +
 4 files changed, 515 insertions(+)
 create mode 100644 compute/pztpqrt.c
 create mode 100644 compute/ztpqrt.c

diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 1d9b97f04..8d61cde93 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -130,6 +130,7 @@ set(ZSRC
     pzunmlqrh.c
     pzunmqr.c
     pzunmqrrh.c
+    pztpqrt.c
     ###
     zgels.c
     zgelqf.c
@@ -167,6 +168,7 @@ set(ZSRC
     zungqr.c
     zunmlq.c
     zunmqr.c
+    ztpqrt.c
     ##################
     # MIXED PRECISION
     ##################
diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c
new file mode 100644
index 000000000..8dd8c6335
--- /dev/null
+++ b/compute/pztpqrt.c
@@ -0,0 +1,151 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file pztpqrt.c
+ *
+ *  MORSE computational routines
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+#define A(m,n) A,  m,  n
+#define B(m,n) B,  m,  n
+#define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
+#define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
+
+/***************************************************************************//**
+ *  Parallel tile QR factorization - dynamic scheduling
+ **/
+void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
+                    MORSE_sequence_t *sequence, MORSE_request_t *request )
+{
+    MORSE_context_t *morse;
+    MORSE_option_t options;
+    size_t ws_worker = 0;
+    size_t ws_host = 0;
+    MORSE_desc_t *DIAG = NULL;
+
+    int k, m, n;
+    int ldak, ldbm;
+    int tempkm, tempkn, tempnn, tempmm, templm;
+    int ib;
+
+    /* Dimension of the first column */
+    int maxm  = B->m - L;
+    int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1);
+
+    morse = morse_context_self();
+    if (sequence->status != MORSE_SUCCESS)
+        return;
+    RUNTIME_options_init(&options, morse, sequence, request);
+
+    ib = MORSE_IB;
+
+    /*
+     * zgeqrt = A->nb * (ib+1)
+     * zunmqr = A->nb * ib
+     * ztsqrt = A->nb * (ib+1)
+     * ztsmqr = A->nb * ib
+     */
+    ws_worker = A->nb * (ib+1);
+
+    /* Allocation of temporary (scratch) working space */
+#if defined(CHAMELEON_USE_CUDA)
+    /* Worker space
+     *
+     * zunmqr = A->nb * ib
+     * ztsmqr = 2 * A->nb * ib
+     */
+    ws_worker = max( ws_worker, ib * A->nb * 2 );
+#endif
+
+#if defined(CHAMELEON_USE_MAGMA)
+    /* Worker space
+     *
+     * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
+     * ztsqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
+     */
+    ws_worker = max( ws_worker, ib * (ib + A->nb) );
+
+    /* Host space
+     *
+     * zgeqrt = ib * (A->mb+3*ib) + A->mb )
+     * ztsqrt = 2 * ib * (A->nb+ib) + A->nb
+     */
+    ws_host = max( ws_host, ib * (A->mb + 3 * ib) + A->mb );
+    ws_host = max( ws_host,  2 * ib * (A->nb + ib) + A->nb );
+#endif
+
+    ws_worker *= sizeof(MORSE_Complex64_t);
+    ws_host   *= sizeof(MORSE_Complex64_t);
+
+    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
+
+#if defined(CHAMELEON_COPY_DIAG)
+    /* necessary to avoid dependencies between tsqrt and unmqr tasks regarding the diag tile */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
+#endif
+
+    for (k = 0; k < A->nt; k++) {
+        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        ldak = BLKLDD(A, k);
+
+        for (m = 0; m < maxmt; m++) {
+            tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
+            templm = m == maxmt-1 ? tempmm       : 0;
+            ldbm = BLKLDD(B, m);
+            MORSE_TASK_ztpqrt(
+                &options,
+                tempmm, tempkn, templm, ib, T->nb,
+                A(k, k), ldak,
+                B(m, k), ldbm,
+                T(m, k), T->mb );
+
+            for (n = k+1; n < B->nt; n++) {
+                tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
+                MORSE_TASK_ztpmqrt(
+                    &options,
+                    MorseLeft, MorseConjTrans,
+                    tempmm, tempnn, tempkm, templm, ib, T->nb,
+                    B(m, k), ldbm,
+                    T(m, k), T->mb,
+                    A(k, n), ldak,
+                    B(m, n), ldbm );
+            }
+        }
+
+        maxmt = min( B->mt, maxmt+1 );
+    }
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, morse);
+    MORSE_TASK_dataflush_all();
+
+#if defined(CHAMELEON_COPY_DIAG)
+    MORSE_Sequence_Wait(sequence);
+    morse_desc_mat_free(DIAG);
+    free(DIAG);
+#endif
+    (void)DIAG;
+}
diff --git a/compute/ztpqrt.c b/compute/ztpqrt.c
new file mode 100644
index 000000000..ef9e4232e
--- /dev/null
+++ b/compute/ztpqrt.c
@@ -0,0 +1,361 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file ztpqrt.c
+ *
+ *  MORSE computational routines
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+/**
+ ******************************************************************************
+ *
+ * @ingroup MORSE_Complex64_t
+ *
+ *  MORSE_ztpqrt - Computes a blocked QR factorization of a
+ *  "triangular-pentagonal" matrix C, which is composed of a triangular block A
+ *  and a pentagonal block B, using the compact representation for Q.
+ *
+ *******************************************************************************
+ *
+ * @param[in] M
+ *          The number of rows of the matrix B. M >= 0.
+ *
+ * @param[in] N
+ *          The number of columns of the matrix B, and the order of the matrix
+ *          A. N >= 0.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of B.
+ *          MIN(M,N) >= L >= 0.  See Further Details.
+ *
+ * @param[in,out] A
+ *          On entry, the upper triangular N-by-N matrix A.
+ *          On exit, the elements on and above the diagonal of the array
+ *          contain the upper triangular matrix R.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ * @param[in,out] B
+ *          On entry, the pentagonal M-by-N matrix B.  The first M-L rows
+ *          are rectangular, and the last L rows are upper trapezoidal.
+ *          On exit, B contains the pentagonal matrix V.  See Further Details.
+ *
+ * @param[in] LDB
+ *          The leading dimension of the array B.  LDB >= max(1,M).
+ *
+ * @param[out] descT
+ *          On exit, auxiliary factorization data, required by MORSE_zgeqrs to
+ *          solve the system of equations, or by any function to apply the Q.
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The input matrix C is a (N+M)-by-N matrix
+ *
+ *               C = [ A ]
+ *                   [ B ]
+ *
+ *  where A is an upper triangular N-by-N matrix, and B is M-by-N pentagonal
+ *  matrix consisting of a (M-L)-by-N rectangular matrix B1 on top of a L-by-N
+ *  upper trapezoidal matrix B2:
+ *
+ *               B = [ B1 ]  <- (M-L)-by-N rectangular
+ *                   [ B2 ]  <-     L-by-N upper trapezoidal.
+ *
+ *  The upper trapezoidal matrix B2 consists of the first L rows of a
+ *  N-by-N upper triangular matrix, where 0 <= L <= MIN(M,N).  If L=0,
+ *  B is rectangular M-by-N; if M=L=N, B is upper triangular.
+ *
+ *  The matrix W stores the elementary reflectors H(i) in the i-th column
+ *  below the diagonal (of A) in the (N+M)-by-N input matrix C
+ *
+ *               C = [ A ]  <- upper triangular N-by-N
+ *                   [ B ]  <- M-by-N pentagonal
+ *
+ *  so that W can be represented as
+ *
+ *               W = [ I ]  <- identity, N-by-N
+ *                   [ V ]  <- M-by-N, same form as B.
+ *
+ *  Thus, all of the information needed for W is contained on exit in B, which
+ *  we call V above.  Note that V has the same form as B; that is,
+ *
+ *               V = [ V1 ] <- (M-L)-by-N rectangular
+ *                   [ V2 ] <-     L-by-N upper trapezoidal.
+ *
+ *  The columns of V represent the vectors which define the H(i)'s.
+ *
+ *  The number of blocks is B = ceiling(N/NB), where each
+ *  block is of order NB except for the last block, which is of order
+ *  IB = N - (B-1)*NB.  For each of the B blocks, an upper triangular block
+ *  reflector factor is computed: T1, T2, ..., TB.  The NB-by-NB (and IB-by-IB
+ *  for the last block) T's are stored in the NB-by-N matrix T as
+ *
+ *               T = [T1 T2 ... TB].
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpqrt_Tile
+ * @sa MORSE_ztpqrt_Tile_Async
+ * @sa MORSE_ctpqrt
+ * @sa MORSE_dtpqrt
+ * @sa MORSE_stpqrt
+ * @sa MORSE_zgeqrs
+ *
+ ******************************************************************************/
+int MORSE_ztpqrt( int M, int N, int L,
+                  MORSE_Complex64_t *A, int LDA,
+                  MORSE_Complex64_t *B, int LDB,
+                  MORSE_desc_t *descT )
+{
+    int NB;
+    int status;
+    MORSE_context_t *morse;
+    MORSE_sequence_t *sequence = NULL;
+    MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
+    MORSE_desc_t descA, descB;
+    int minMN = min( M, N );
+
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_fatal_error("MORSE_ztpqrt", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+
+    /* Check input arguments */
+    if (M < 0) {
+        morse_error("MORSE_ztpqrt", "illegal value of M");
+        return -1;
+    }
+    if (N < 0) {
+        morse_error("MORSE_ztpqrt", "illegal value of N");
+        return -2;
+    }
+    if ((L < 0) || ((L > minMN) && (minMN > 0))) {
+        morse_error("MORSE_ztpqrt", "illegal value of L");
+        return -3;
+    }
+    if (LDA < max(1, N)) {
+        morse_error("MORSE_ztpqrt", "illegal value of LDA");
+        return -5;
+    }
+    if (LDB < max(1, M)) {
+        morse_error("MORSE_ztpqrt", "illegal value of LDB");
+        return -7;
+    }
+
+    /* Quick return */
+    if (minMN == 0)
+        return MORSE_SUCCESS;
+
+    /* Tune NB & IB depending on M & N, using the MORSE_FUNC_ZGELS entry; Set NBNBSIZE */
+    status = morse_tune(MORSE_FUNC_ZGELS, M, N, 0);
+    if (status != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpqrt", "morse_tune() failed");
+        return status;
+    }
+
+    /* Set NB, the tile size handed to the layout conversions below */
+    NB = MORSE_NB;
+
+    morse_sequence_create(morse, &sequence);
+
+/*    if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) {*/
+        morse_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, N, N, sequence, &request,
+                            morse_desc_mat_free(&(descA)) );
+        morse_zooplap2tile( descB, B, NB, NB, LDB, N, 0, 0, M, N, sequence, &request,
+                            (morse_desc_mat_free(&(descA)), morse_desc_mat_free(&(descB))) );
+/*    } else {*/
+/*        morse_ziplap2tile( descA, A, NB, NB, LDA, N, 0, 0, M, N,*/
+/*                            sequence, &request);*/
+/*    }*/
+
+    /* Call the tile interface */
+    MORSE_ztpqrt_Tile_Async(L, &descA, &descB, descT, sequence, &request);
+
+/*    if ( MORSE_TRANSLATION == MORSE_OUTOFPLACE ) {*/
+        morse_zooptile2lap(descA, A, NB, NB, LDA, N, sequence, &request);
+        morse_zooptile2lap(descB, B, NB, NB, LDB, N, sequence, &request);
+        morse_sequence_wait(morse, sequence);
+        morse_desc_mat_free(&descA);
+        morse_desc_mat_free(&descB);
+/*    } else {*/
+/*        morse_ziptile2lap( descA, A, NB, NB, LDA, N,  sequence, &request);*/
+/*        morse_ziptile2lap( descB, B, NB, NB, LDB, N,  sequence, &request);*/
+/*        morse_sequence_wait(morse, sequence);*/
+/*    }*/
+    /* Read the sequence status before the sequence is destroyed */
+    status = sequence->status;
+    morse_sequence_destroy(morse, sequence);
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup MORSE_Complex64_t_Tile
+ *
+ *  MORSE_ztpqrt_Tile - Computes the tile QR factorization of a matrix.
+ *  Tile equivalent of MORSE_ztpqrt().
+ *  Operates on matrices stored by tiles.
+ *  All matrices are passed through descriptors.
+ *  All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of B. See MORSE_ztpqrt().
+ * @param[in,out] A
+ *          On entry, the upper triangular matrix A. On exit, the upper triangular matrix R.
+ * @param[in,out] B
+ *          On entry, the pentagonal matrix B. On exit, the pentagonal matrix V that
+ *          defines the elementary reflectors. See MORSE_ztpqrt().
+ * @param[out] T
+ *          On exit, auxiliary factorization data, required by MORSE_zgeqrs to solve the system
+ *          of equations, or by any function to apply the Q.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpqrt
+ * @sa MORSE_ztpqrt_Tile_Async
+ * @sa MORSE_ctpqrt_Tile
+ * @sa MORSE_dtpqrt_Tile
+ * @sa MORSE_stpqrt_Tile
+ * @sa MORSE_zgeqrs_Tile
+ *
+ ******************************************************************************/
+int MORSE_ztpqrt_Tile( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T )
+{
+    MORSE_context_t *morse;
+    MORSE_sequence_t *sequence = NULL;
+    MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
+    int status;
+    /* Blocking wrapper: runs the Async variant on a private sequence and waits on it. */
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_fatal_error("MORSE_ztpqrt_Tile", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+    morse_sequence_create(morse, &sequence);
+    MORSE_ztpqrt_Tile_Async(L, A, B, T, sequence, &request);
+    morse_sequence_wait(morse, sequence);
+    RUNTIME_desc_getoncpu(B); /* NOTE(review): only B is fetched; A is also in/out -- confirm */
+    /* The status is read before the sequence is destroyed. */
+    status = sequence->status;
+    morse_sequence_destroy(morse, sequence);
+    return status;
+}
+
+/***************************************************************************//**
+ *
+ * @ingroup MORSE_Complex64_t_Tile_Async
+ *
+ *  MORSE_ztpqrt_Tile_Async - Computes the tile QR factorization of a matrix.
+ *  Non-blocking equivalent of MORSE_ztpqrt_Tile().
+ *  May return before the computation is finished.
+ *  Allows for pipelining of operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpqrt
+ * @sa MORSE_ztpqrt_Tile
+ * @sa MORSE_ctpqrt_Tile_Async
+ * @sa MORSE_dtpqrt_Tile_Async
+ * @sa MORSE_stpqrt_Tile_Async
+ * @sa MORSE_zgeqrs_Tile_Async
+ *
+ ******************************************************************************/
+int MORSE_ztpqrt_Tile_Async( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
+                             MORSE_sequence_t *sequence, MORSE_request_t *request )
+{
+    MORSE_context_t *morse;
+    /* The context, sequence and request handles are validated before anything else. */
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_ztpqrt_Tile", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        morse_fatal_error("MORSE_ztpqrt_Tile", "NULL sequence");
+        return MORSE_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        morse_fatal_error("MORSE_ztpqrt_Tile", "NULL request");
+        return MORSE_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == MORSE_SUCCESS)
+        request->status = MORSE_SUCCESS;
+    else
+        return morse_request_fail(sequence, request, MORSE_ERR_SEQUENCE_FLUSHED);
+
+    /* Check descriptors for correctness */
+    if (morse_desc_check(A) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpqrt_Tile", "invalid first descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (morse_desc_check(B) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpqrt_Tile", "invalid second descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (morse_desc_check(T) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpqrt_Tile", "invalid third descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments: A must use square tiles, and B->m - L must be a multiple of B->mb */
+    if (A->nb != A->mb) {
+        morse_error("MORSE_ztpqrt_Tile", "only square tiles supported");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (((B->m - L) % B->mb) != 0) {
+        morse_error("MORSE_ztpqrt_Tile", "Triangular part must be aligned with tiles");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    /* morse_pztpqrt is called unconditionally; the householder-mode dispatch is commented out. */
+    /* if (morse->householder == MORSE_FLAT_HOUSEHOLDER) { */
+    morse_pztpqrt(L, A, B, T, sequence, request);
+    /* } */
+    /* else { */
+    /*    morse_pztpqrtrh(A, T, MORSE_RHBLK, sequence, request); */
+    /* } */
+    /* morse_pztpqrt returns void, so this success only covers the checks above. */
+    return MORSE_SUCCESS;
+}
diff --git a/control/compute_z.h b/control/compute_z.h
index fd6051a49..d99406b14 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -134,6 +134,7 @@ void morse_pzsyrk(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MO
 void morse_pzsyr2k(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzsytrf(MORSE_enum uplo, MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztile2band(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *descAB, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request );
 void morse_pztradd(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztrsm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
-- 
GitLab


From 22869cafe224b2ea75a63cc543f36d292bc388e9 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Fri, 16 Dec 2016 00:06:47 +0100
Subject: [PATCH 5/8] Add codelets in all three runtimes

---
 coreblas/compute/core_ztpmqrt.c               |   2 +-
 runtime/CMakeLists.txt                        |  78 +++++++++
 runtime/parsec/CMakeLists.txt                 |  72 +-------
 runtime/parsec/codelets/codelet_ztpmqrt.c     | 102 +++++++++++
 runtime/parsec/codelets/codelet_ztpqrt.c      |  85 ++++++++++
 runtime/quark/CMakeLists.txt                  |  72 +-------
 runtime/quark/codelets/codelet_ztpmqrt.c      |  84 +++++++++
 runtime/quark/codelets/codelet_ztpqrt.c       |  72 ++++++++
 runtime/starpu/CMakeLists.txt                 |  72 +-------
 runtime/starpu/codelets/codelet_zcallback.c   |   4 +-
 runtime/starpu/codelets/codelet_ztpmqrt.c     | 159 ++++++++++++++++++
 runtime/starpu/codelets/codelet_ztpqrt.c      |  99 +++++++++++
 .../starpu/include/runtime_codelet_profile.h  |   2 +-
 runtime/starpu/include/runtime_codelet_z.h    |   2 +
 runtime/starpu/include/runtime_codelets.h     |   2 +-
 runtime/starpu/include/runtime_workspace.h    |   8 +-
 16 files changed, 694 insertions(+), 221 deletions(-)
 create mode 100644 runtime/parsec/codelets/codelet_ztpmqrt.c
 create mode 100644 runtime/parsec/codelets/codelet_ztpqrt.c
 create mode 100644 runtime/quark/codelets/codelet_ztpmqrt.c
 create mode 100644 runtime/quark/codelets/codelet_ztpqrt.c
 create mode 100644 runtime/starpu/codelets/codelet_ztpmqrt.c
 create mode 100644 runtime/starpu/codelets/codelet_ztpqrt.c

diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c
index ee44b8cf8..2241b5d39 100644
--- a/coreblas/compute/core_ztpmqrt.c
+++ b/coreblas/compute/core_ztpmqrt.c
@@ -98,7 +98,7 @@
  *
  * @param[out] WORK
  *         Workspace array of size LDWORK-by-NB.
- *         LDWORK = N if side =MorseLeft, or  M if side = MorseRight.
+ *         LDWORK = N if side = MorseLeft, or  M if side = MorseRight.
  *
  *******************************************************************************
  *
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index b0d76eeba..0224e3644 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -26,6 +26,84 @@
 #
 ###
 
+# List of codelets required by all runtimes
+# -----------------------------------------
+set(CODELETS_ZSRC
+    codelets/codelet_ztile_zero.c
+    codelets/codelet_zasum.c
+    ##################
+    # BLAS 1
+    ##################
+    codelets/codelet_zaxpy.c
+    ##################
+    # BLAS 3
+    ##################
+    codelets/codelet_zgemm.c
+    codelets/codelet_zhemm.c
+    codelets/codelet_zher2k.c
+    codelets/codelet_zherk.c
+    codelets/codelet_zsymm.c
+    codelets/codelet_zsyr2k.c
+    codelets/codelet_zsyrk.c
+    codelets/codelet_ztrmm.c
+    codelets/codelet_ztrsm.c
+    ##################
+    # LAPACK
+    ##################
+    codelets/codelet_zgeadd.c
+    codelets/codelet_zlascal.c
+    codelets/codelet_zgelqt.c
+    codelets/codelet_zgeqrt.c
+    codelets/codelet_zgessm.c
+    codelets/codelet_zgessq.c
+    codelets/codelet_zgetrf.c
+    codelets/codelet_zgetrf_incpiv.c
+    codelets/codelet_zgetrf_nopiv.c
+    codelets/codelet_zhe2ge.c
+    codelets/codelet_zherfb.c
+    codelets/codelet_zhessq.c
+    codelets/codelet_zlacpy.c
+    codelets/codelet_zlange.c
+    codelets/codelet_zlanhe.c
+    codelets/codelet_zlansy.c
+    codelets/codelet_zlantr.c
+    codelets/codelet_zlaset2.c
+    codelets/codelet_zlaset.c
+    codelets/codelet_zlatro.c
+    codelets/codelet_zlauum.c
+    codelets/codelet_zplghe.c
+    codelets/codelet_zplgsy.c
+    codelets/codelet_zplrnt.c
+    codelets/codelet_zplssq.c
+    codelets/codelet_zpotrf.c
+    codelets/codelet_zssssm.c
+    codelets/codelet_zsyssq.c
+    codelets/codelet_zsytrf_nopiv.c
+    codelets/codelet_ztpqrt.c
+    codelets/codelet_ztpmqrt.c
+    codelets/codelet_ztradd.c
+    codelets/codelet_ztrasm.c
+    codelets/codelet_ztrssq.c
+    codelets/codelet_ztrtri.c
+    codelets/codelet_ztslqt.c
+    codelets/codelet_ztsmlq.c
+    codelets/codelet_ztsmqr.c
+    codelets/codelet_ztsmlq_hetra1.c
+    codelets/codelet_ztsmqr_hetra1.c
+    codelets/codelet_ztsqrt.c
+    codelets/codelet_ztstrf.c
+    codelets/codelet_zttlqt.c
+    codelets/codelet_zttmlq.c
+    codelets/codelet_zttmqr.c
+    codelets/codelet_zttqrt.c
+    codelets/codelet_zunmlq.c
+    codelets/codelet_zunmqr.c
+    ##################
+    # BUILD
+    ##################
+    codelets/codelet_zbuild.c
+    )
+
 # Check for the subdirectories
 # ----------------------------
 if( CHAMELEON_SCHED_QUARK )
diff --git a/runtime/parsec/CMakeLists.txt b/runtime/parsec/CMakeLists.txt
index a19890afe..872c19c48 100644
--- a/runtime/parsec/CMakeLists.txt
+++ b/runtime/parsec/CMakeLists.txt
@@ -88,77 +88,7 @@ set(RUNTIME_COMMON
 # ------------------------------------------------------
 set(RUNTIME_SRCS_GENERATED "")
 set(ZSRC
-    codelets/codelet_ztile_zero.c
-    codelets/codelet_zasum.c
-    ##################
-    # BLAS 1
-    ##################
-    codelets/codelet_zaxpy.c
-    ##################
-    # BLAS 3
-    ##################
-    codelets/codelet_zgemm.c
-    codelets/codelet_zhemm.c
-    codelets/codelet_zher2k.c
-    codelets/codelet_zherk.c
-    codelets/codelet_zsymm.c
-    codelets/codelet_zsyr2k.c
-    codelets/codelet_zsyrk.c
-    codelets/codelet_ztrmm.c
-    codelets/codelet_ztrsm.c
-    ##################
-    # LAPACK
-    ##################
-    codelets/codelet_zgeadd.c
-    codelets/codelet_zlascal.c
-    codelets/codelet_zgelqt.c
-    codelets/codelet_zgeqrt.c
-    codelets/codelet_zgessm.c
-    codelets/codelet_zgessq.c
-    codelets/codelet_zgetrf.c
-    codelets/codelet_zgetrf_incpiv.c
-    codelets/codelet_zgetrf_nopiv.c
-    codelets/codelet_zhe2ge.c
-    codelets/codelet_zherfb.c
-    codelets/codelet_zhessq.c
-    codelets/codelet_zlacpy.c
-    codelets/codelet_zlange.c
-    codelets/codelet_zlanhe.c
-    codelets/codelet_zlansy.c
-    codelets/codelet_zlantr.c
-    codelets/codelet_zlaset2.c
-    codelets/codelet_zlaset.c
-    codelets/codelet_zlatro.c
-    codelets/codelet_zlauum.c
-    codelets/codelet_zplghe.c
-    codelets/codelet_zplgsy.c
-    codelets/codelet_zplrnt.c
-    codelets/codelet_zplssq.c
-    codelets/codelet_zpotrf.c
-    codelets/codelet_zssssm.c
-    codelets/codelet_zsyssq.c
-    codelets/codelet_zsytrf_nopiv.c
-    codelets/codelet_ztradd.c
-    codelets/codelet_ztrasm.c
-    codelets/codelet_ztrssq.c
-    codelets/codelet_ztrtri.c
-    codelets/codelet_ztslqt.c
-    codelets/codelet_ztsmlq.c
-    codelets/codelet_ztsmqr.c
-    codelets/codelet_ztsmlq_hetra1.c
-    codelets/codelet_ztsmqr_hetra1.c
-    codelets/codelet_ztsqrt.c
-    codelets/codelet_ztstrf.c
-    codelets/codelet_zttlqt.c
-    codelets/codelet_zttmlq.c
-    codelets/codelet_zttmqr.c
-    codelets/codelet_zttqrt.c
-    codelets/codelet_zunmlq.c
-    codelets/codelet_zunmqr.c
-    ##################
-    # BUILD
-    ##################
-    codelets/codelet_zbuild.c
+    ${CODELETS_ZSRC}
     )
 
 precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
diff --git a/runtime/parsec/codelets/codelet_ztpmqrt.c b/runtime/parsec/codelets/codelet_ztpmqrt.c
new file mode 100644
index 000000000..612e9d54a
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_ztpmqrt.c
@@ -0,0 +1,102 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpmqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/parsec/include/morse_parsec.h"
+
+static int
+CORE_ztpmqrt_parsec(dague_execution_unit_t    *context,
+                    dague_execution_context_t *this_task)
+{
+    MORSE_enum *side;
+    MORSE_enum *trans;
+    int *M;
+    int *N;
+    int *K;
+    int *L;
+    int *ib;
+    const MORSE_Complex64_t *V;
+    int *ldv;
+    const MORSE_Complex64_t *T;
+    int *ldt;
+    MORSE_Complex64_t *A;
+    int *lda;
+    MORSE_Complex64_t *B;
+    int *ldb;
+    MORSE_Complex64_t *WORK;
+    /* Arguments are unpacked in the same order MORSE_TASK_ztpmqrt passes them to dague_insert_task. */
+    dague_dtd_unpack_args(
+        this_task,
+        UNPACK_VALUE,   &side,
+        UNPACK_VALUE,   &trans,
+        UNPACK_VALUE,   &M,
+        UNPACK_VALUE,   &N,
+        UNPACK_VALUE,   &K,
+        UNPACK_VALUE,   &L,
+        UNPACK_VALUE,   &ib,
+        UNPACK_DATA,    &V,
+        UNPACK_VALUE,   &ldv,
+        UNPACK_DATA,    &T,
+        UNPACK_VALUE,   &ldt,
+        UNPACK_DATA,    &A,
+        UNPACK_VALUE,   &lda,
+        UNPACK_DATA,    &B,
+        UNPACK_VALUE,   &ldb,
+        UNPACK_SCRATCH, &WORK );
+    /* By-value arguments are held as pointers here, hence the dereferences in the kernel call. */
+    CORE_ztpmqrt( *side, *trans, *M, *N, *K, *L, *ib,
+                  V, *ldv, T, *ldt, A, *lda, B, *ldb, WORK );
+
+    return 0;
+}
+
+void MORSE_TASK_ztpmqrt( const MORSE_option_t *options,
+                         MORSE_enum side, MORSE_enum trans,
+                         int M, int N, int K, int L, int ib, int nb,
+                         const MORSE_desc_t *V, int Vm, int Vn, int ldv,
+                         const MORSE_desc_t *T, int Tm, int Tn, int ldt,
+                         const MORSE_desc_t *A, int Am, int An, int lda,
+                         const MORSE_desc_t *B, int Bm, int Bn, int ldb )
+{
+    dague_dtd_handle_t* DAGUE_dtd_handle = (dague_dtd_handle_t *)(options->sequence->schedopt);
+    /* V and T tiles are INPUT, A and B tiles are INOUT, and the scratch holds ib*nb elements. */
+    dague_insert_task(
+        DAGUE_dtd_handle, CORE_ztpmqrt_parsec, "tpmqrt",
+        sizeof(MORSE_enum), &side,  VALUE,
+        sizeof(MORSE_enum), &trans, VALUE,
+        sizeof(int),        &M,     VALUE,
+        sizeof(int),        &N,     VALUE,
+        sizeof(int),        &K,     VALUE,
+        sizeof(int),        &L,     VALUE,
+        sizeof(int),        &ib,    VALUE,
+        PASSED_BY_REF,       RTBLKADDR( V, MORSE_Complex64_t, Vm, Vn ), INPUT | REGION_FULL,
+        sizeof(int),        &ldv,   VALUE,
+        PASSED_BY_REF,       RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INPUT | REGION_FULL,
+        sizeof(int),        &ldt,   VALUE,
+        PASSED_BY_REF,       RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | REGION_FULL,
+        sizeof(int),        &lda,   VALUE,
+        PASSED_BY_REF,       RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT | REGION_FULL,
+        sizeof(int),        &ldb,   VALUE,
+        sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH,
+        0);
+}
diff --git a/runtime/parsec/codelets/codelet_ztpqrt.c b/runtime/parsec/codelets/codelet_ztpqrt.c
new file mode 100644
index 000000000..a0b3f6e06
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_ztpqrt.c
@@ -0,0 +1,85 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/parsec/include/morse_parsec.h"
+
+static int
+CORE_ztpqrt_parsec(dague_execution_unit_t    *context,
+                   dague_execution_context_t *this_task)
+{
+    int *M;
+    int *N;
+    int *L;
+    int *ib;
+    MORSE_Complex64_t *A;
+    int *lda;
+    MORSE_Complex64_t *B;
+    int *ldb;
+    MORSE_Complex64_t *T;
+    int *ldt;
+    MORSE_Complex64_t *WORK;
+    /* Arguments are unpacked in the same order MORSE_TASK_ztpqrt passes them to dague_insert_task. */
+    dague_dtd_unpack_args(
+        this_task,
+        UNPACK_VALUE,   &M,
+        UNPACK_VALUE,   &N,
+        UNPACK_VALUE,   &L,
+        UNPACK_VALUE,   &ib,
+        UNPACK_DATA,    &A,
+        UNPACK_VALUE,   &lda,
+        UNPACK_DATA,    &B,
+        UNPACK_VALUE,   &ldb,
+        UNPACK_DATA,    &T,
+        UNPACK_VALUE,   &ldt,
+        UNPACK_SCRATCH, &WORK );
+    /* By-value arguments are held as pointers here, hence the dereferences in the kernel call. */
+    CORE_ztpqrt( *M, *N, *L, *ib,
+                 A, *lda, B, *ldb, T, *ldt, WORK );
+
+    return 0;
+}
+
+void MORSE_TASK_ztpqrt( const MORSE_option_t *options,
+                         int M, int N, int L, int ib, int nb,
+                         const MORSE_desc_t *A, int Am, int An, int lda,
+                         const MORSE_desc_t *B, int Bm, int Bn, int ldb,
+                         const MORSE_desc_t *T, int Tm, int Tn, int ldt )
+{
+    dague_dtd_handle_t* DAGUE_dtd_handle = (dague_dtd_handle_t *)(options->sequence->schedopt);
+    /* A (REGION_U | REGION_D), B and T are all INOUT; the scratch holds ib*nb elements. */
+    dague_insert_task(
+        DAGUE_dtd_handle, CORE_ztpqrt_parsec, "tpqrt",
+        sizeof(int),   &M,   VALUE,
+        sizeof(int),   &N,   VALUE,
+        sizeof(int),   &L,   VALUE,
+        sizeof(int),   &ib,  VALUE,
+        PASSED_BY_REF,  RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | REGION_U | REGION_D,
+        sizeof(int),   &lda, VALUE,
+        PASSED_BY_REF,  RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT | REGION_FULL,
+        sizeof(int),   &ldb, VALUE,
+        PASSED_BY_REF,  RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INOUT | REGION_FULL,
+        sizeof(int),   &ldt, VALUE,
+        sizeof(MORSE_Complex64_t)*ib*nb, NULL, SCRATCH,
+        0);
+}
diff --git a/runtime/quark/CMakeLists.txt b/runtime/quark/CMakeLists.txt
index 9366a00d7..fa7952a15 100644
--- a/runtime/quark/CMakeLists.txt
+++ b/runtime/quark/CMakeLists.txt
@@ -86,77 +86,7 @@ set(RUNTIME_COMMON
 # ------------------------------------------------------
 set(RUNTIME_SRCS_GENERATED "")
 set(ZSRC
-    codelets/codelet_ztile_zero.c
-    codelets/codelet_zasum.c
-    ##################
-    # BLAS 1
-    ##################
-    codelets/codelet_zaxpy.c
-    ##################
-    # BLAS 3
-    ##################
-    codelets/codelet_zgemm.c
-    codelets/codelet_zhemm.c
-    codelets/codelet_zher2k.c
-    codelets/codelet_zherk.c
-    codelets/codelet_zsymm.c
-    codelets/codelet_zsyr2k.c
-    codelets/codelet_zsyrk.c
-    codelets/codelet_ztrmm.c
-    codelets/codelet_ztrsm.c
-    ##################
-    # LAPACK
-    ##################
-    codelets/codelet_zgeadd.c
-    codelets/codelet_zlascal.c
-    codelets/codelet_zgelqt.c
-    codelets/codelet_zgeqrt.c
-    codelets/codelet_zgessm.c
-    codelets/codelet_zgessq.c
-    codelets/codelet_zgetrf.c
-    codelets/codelet_zgetrf_incpiv.c
-    codelets/codelet_zgetrf_nopiv.c
-    codelets/codelet_zhe2ge.c
-    codelets/codelet_zherfb.c
-    codelets/codelet_zhessq.c
-    codelets/codelet_zlacpy.c
-    codelets/codelet_zlange.c
-    codelets/codelet_zlanhe.c
-    codelets/codelet_zlansy.c
-    codelets/codelet_zlantr.c
-    codelets/codelet_zlaset2.c
-    codelets/codelet_zlaset.c
-    codelets/codelet_zlatro.c
-    codelets/codelet_zlauum.c
-    codelets/codelet_zplghe.c
-    codelets/codelet_zplgsy.c
-    codelets/codelet_zplrnt.c
-    codelets/codelet_zplssq.c
-    codelets/codelet_zpotrf.c
-    codelets/codelet_zssssm.c
-    codelets/codelet_zsyssq.c
-    codelets/codelet_zsytrf_nopiv.c
-    codelets/codelet_ztradd.c
-    codelets/codelet_ztrasm.c
-    codelets/codelet_ztrssq.c
-    codelets/codelet_ztrtri.c
-    codelets/codelet_ztslqt.c
-    codelets/codelet_ztsmlq.c
-    codelets/codelet_ztsmqr.c
-    codelets/codelet_ztsmlq_hetra1.c
-    codelets/codelet_ztsmqr_hetra1.c
-    codelets/codelet_ztsqrt.c
-    codelets/codelet_ztstrf.c
-    codelets/codelet_zttlqt.c
-    codelets/codelet_zttmlq.c
-    codelets/codelet_zttmqr.c
-    codelets/codelet_zttqrt.c
-    codelets/codelet_zunmlq.c
-    codelets/codelet_zunmqr.c
-    ##################
-    # BUILD
-    ##################
-    codelets/codelet_zbuild.c
+    ${CODELETS_ZSRC}
     )
 
 precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
diff --git a/runtime/quark/codelets/codelet_ztpmqrt.c b/runtime/quark/codelets/codelet_ztpmqrt.c
new file mode 100644
index 000000000..25bd5ac83
--- /dev/null
+++ b/runtime/quark/codelets/codelet_ztpmqrt.c
@@ -0,0 +1,84 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpmqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/quark/include/morse_quark.h"
+
+static void
+CORE_ztpmqrt_quark( Quark *quark )
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int M;
+    int N;
+    int K;
+    int L;
+    int ib;
+    const MORSE_Complex64_t *V;
+    int ldv;
+    const MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *WORK;
+
+    quark_unpack_args_16( quark, side, trans, M, N, K, L, ib,
+                          V, ldv, T, ldt, A, lda, B, ldb, WORK );
+
+    CORE_ztpmqrt( side, trans, M, N, K, L, ib,
+                  V, ldv, T, ldt, A, lda, B, ldb, WORK );
+}
+
+void MORSE_TASK_ztpmqrt( const MORSE_option_t *options,
+                         MORSE_enum side, MORSE_enum trans,
+                         int M, int N, int K, int L, int ib, int nb,
+                         const MORSE_desc_t *V, int Vm, int Vn, int ldv,
+                         const MORSE_desc_t *T, int Tm, int Tn, int ldt,
+                         const MORSE_desc_t *A, int Am, int An, int lda,
+                         const MORSE_desc_t *B, int Bm, int Bn, int ldb )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_TSMQR;
+
+    QUARK_Insert_Task(
+        opt->quark, CORE_ztpmqrt_quark, (Quark_Task_Flags*)opt,
+        sizeof(MORSE_enum),              &side,  VALUE,
+        sizeof(MORSE_enum),              &trans, VALUE,
+        sizeof(int),                     &M,     VALUE,
+        sizeof(int),                     &N,     VALUE,
+        sizeof(int),                     &K,     VALUE,
+        sizeof(int),                     &L,     VALUE,
+        sizeof(int),                     &ib,    VALUE,
+        sizeof(MORSE_Complex64_t)*nb*nb,  RTBLKADDR( V, MORSE_Complex64_t, Vm, Vn ), INPUT,
+        sizeof(int),                     &ldv,   VALUE,
+        sizeof(MORSE_Complex64_t)*ib*nb,  RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), INPUT,
+        sizeof(int),                     &ldt,   VALUE,
+        sizeof(MORSE_Complex64_t)*nb*nb,  RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT,
+        sizeof(int),                     &lda,   VALUE,
+        sizeof(MORSE_Complex64_t)*nb*nb,  RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT,
+        sizeof(int),                     &ldb,   VALUE,
+        sizeof(MORSE_Complex64_t)*ib*nb,  NULL, SCRATCH,
+        0);
+}
diff --git a/runtime/quark/codelets/codelet_ztpqrt.c b/runtime/quark/codelets/codelet_ztpqrt.c
new file mode 100644
index 000000000..9b7e09876
--- /dev/null
+++ b/runtime/quark/codelets/codelet_ztpqrt.c
@@ -0,0 +1,72 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/quark/include/morse_quark.h"
+
+static void
+CORE_ztpqrt_quark( Quark *quark )
+{
+    int M;
+    int N;
+    int L;
+    int ib;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *WORK;
+
+    quark_unpack_args_11( quark, M, N, L, ib,
+                          A, lda, B, ldb, T, ldt, WORK );
+
+    CORE_ztpqrt( M, N, L, ib,
+                 A, lda, B, ldb, T, ldt, WORK );
+}
+
+void MORSE_TASK_ztpqrt( const MORSE_option_t *options,
+                         int M, int N, int L, int ib, int nb,
+                         const MORSE_desc_t *A, int Am, int An, int lda,
+                         const MORSE_desc_t *B, int Bm, int Bn, int ldb,
+                         const MORSE_desc_t *T, int Tm, int Tn, int ldt )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_TSQRT;
+
+    QUARK_Insert_Task(
+        opt->quark, CORE_ztpqrt_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),                         &M,   VALUE,
+        sizeof(int),                         &N,   VALUE,
+        sizeof(int),                         &L,   VALUE,
+        sizeof(int),                         &ib,  VALUE,
+        sizeof(MORSE_Complex64_t)*nb*nb,      RTBLKADDR( A, MORSE_Complex64_t, Am, An ), INOUT | QUARK_REGION_U | QUARK_REGION_D,
+        sizeof(int),                         &lda, VALUE,
+        sizeof(MORSE_Complex64_t)*nb*nb,      RTBLKADDR( B, MORSE_Complex64_t, Bm, Bn ), INOUT,
+        sizeof(int),                         &ldb, VALUE,
+        sizeof(MORSE_Complex64_t)*nb*ib,      RTBLKADDR( T, MORSE_Complex64_t, Tm, Tn ), OUTPUT,
+        sizeof(int),                         &ldt, VALUE,
+        sizeof(MORSE_Complex64_t)*(ib+1)*nb,  NULL, SCRATCH,
+        0);
+}
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 08956acaf..b2748379d 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -106,77 +106,7 @@ set_source_files_properties(control/runtime_profiling.c PROPERTIES COMPILE_FLAGS
 set(RUNTIME_SRCS_GENERATED "")
 set(ZSRC
     codelets/codelet_zcallback.c
-    codelets/codelet_ztile_zero.c
-    codelets/codelet_zasum.c
-    ##################
-    # BLAS 1
-    ##################
-    codelets/codelet_zaxpy.c
-    ##################
-    # BLAS 3
-    ##################
-    codelets/codelet_zgemm.c
-    codelets/codelet_zhemm.c
-    codelets/codelet_zher2k.c
-    codelets/codelet_zherk.c
-    codelets/codelet_zsymm.c
-    codelets/codelet_zsyr2k.c
-    codelets/codelet_zsyrk.c
-    codelets/codelet_ztrmm.c
-    codelets/codelet_ztrsm.c
-    ##################
-    # LAPACK
-    ##################
-    codelets/codelet_zgeadd.c
-    codelets/codelet_zlascal.c
-    codelets/codelet_zgelqt.c
-    codelets/codelet_zgeqrt.c
-    codelets/codelet_zgessm.c
-    codelets/codelet_zgessq.c
-    codelets/codelet_zgetrf.c
-    codelets/codelet_zgetrf_incpiv.c
-    codelets/codelet_zgetrf_nopiv.c
-    codelets/codelet_zhe2ge.c
-    codelets/codelet_zherfb.c
-    codelets/codelet_zhessq.c
-    codelets/codelet_zlacpy.c
-    codelets/codelet_zlange.c
-    codelets/codelet_zlanhe.c
-    codelets/codelet_zlansy.c
-    codelets/codelet_zlantr.c
-    codelets/codelet_zlaset2.c
-    codelets/codelet_zlaset.c
-    codelets/codelet_zlatro.c
-    codelets/codelet_zlauum.c
-    codelets/codelet_zplghe.c
-    codelets/codelet_zplgsy.c
-    codelets/codelet_zplrnt.c
-    codelets/codelet_zplssq.c
-    codelets/codelet_zpotrf.c
-    codelets/codelet_zssssm.c
-    codelets/codelet_zsyssq.c
-    codelets/codelet_zsytrf_nopiv.c
-    codelets/codelet_ztradd.c
-    codelets/codelet_ztrasm.c
-    codelets/codelet_ztrssq.c
-    codelets/codelet_ztrtri.c
-    codelets/codelet_ztslqt.c
-    codelets/codelet_ztsmlq.c
-    codelets/codelet_ztsmqr.c
-    codelets/codelet_ztsmlq_hetra1.c
-    codelets/codelet_ztsmqr_hetra1.c
-    codelets/codelet_ztsqrt.c
-    codelets/codelet_ztstrf.c
-    codelets/codelet_zttlqt.c
-    codelets/codelet_zttmlq.c
-    codelets/codelet_zttmqr.c
-    codelets/codelet_zttqrt.c
-    codelets/codelet_zunmlq.c
-    codelets/codelet_zunmqr.c
-    ##################
-    # BUILD
-    ##################
-    codelets/codelet_zbuild.c
+    ${CODELETS_ZSRC}
     )
 
 precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 8af4ec546..bb26aa301 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -67,7 +67,9 @@ CHAMELEON_CL_CB(zssssm,        starpu_matrix_get_nx(task->handles[0]), starpu_ma
 CHAMELEON_CL_CB(zsymm,         starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0,                                           2.*M*M *N);
 CHAMELEON_CL_CB(zsyr2k,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      ( 1.+2.*M*N)*M);
 CHAMELEON_CL_CB(zsyrk,         starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      ( 1.+   M)*M*N);
-CHAMELEON_CL_CB(ztrasm,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                               0.5*M*(M+1));
+CHAMELEON_CL_CB(ztpqrt,        starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), starpu_matrix_get_nx(task->handles[0]),       2.*M*N*K);
+CHAMELEON_CL_CB(ztpmqrt,       starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), starpu_matrix_get_nx(task->handles[2]),       4.*M*N*K);
+CHAMELEON_CL_CB(ztrasm,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                         0.5*M*(M+1));
 CHAMELEON_CL_CB(ztrmm,         starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0,                                               M*M*N);
 CHAMELEON_CL_CB(ztrsm,         starpu_matrix_get_nx(task->handles[1]), starpu_matrix_get_ny(task->handles[1]), 0,                                               M*M*N);
 CHAMELEON_CL_CB(ztrtri,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (1./3.)*M *M*M);
diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c
new file mode 100644
index 000000000..98188588e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_ztpmqrt.c
@@ -0,0 +1,175 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpmqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/starpu/include/morse_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+/**
+ * Submit a ztpmqrt task to StarPU.
+ *
+ * The task is inserted only if at least one of the V, T, A or B tiles is
+ * local according to morse_desc_islocal().  The scalar arguments are packed
+ * as STARPU_VALUE in exactly the order in which the codelet functions unpack
+ * them: side, trans, M, N, K, L, ib, ldv, ldt, lda, ldb.
+ *
+ * V and T are read-only, A and B are read-write, and options->ws_worker is
+ * passed as the scratch workspace.  With CHAMELEON_USE_MPI the task is
+ * executed on the node that owns the tile B(Bm, Bn).
+ *
+ * nb is not used by this implementation; it is part of the signature shared
+ * with the QUARK runtime, which uses it to size its buffers.
+ */
+void MORSE_TASK_ztpmqrt( const MORSE_option_t *options,
+                         MORSE_enum side, MORSE_enum trans,
+                         int M, int N, int K, int L, int ib, int nb,
+                         const MORSE_desc_t *V, int Vm, int Vn, int ldv,
+                         const MORSE_desc_t *T, int Tm, int Tn, int ldt,
+                         const MORSE_desc_t *A, int Am, int An, int lda,
+                         const MORSE_desc_t *B, int Bm, int Bn, int ldb )
+{
+    struct starpu_codelet *codelet = &cl_ztpmqrt;
+    void (*callback)(void*) = options->profiling ? cl_ztpmqrt_callback : NULL;
+
+    if ( morse_desc_islocal( A, Am, An ) ||
+         morse_desc_islocal( B, Bm, Bn ) ||
+         morse_desc_islocal( V, Vm, Vn ) ||
+         morse_desc_islocal( T, Tm, Tn ) )
+    {
+        starpu_insert_task(
+            codelet,
+            STARPU_VALUE, &side,  sizeof(MORSE_enum),
+            STARPU_VALUE, &trans, sizeof(MORSE_enum),
+            STARPU_VALUE, &M,     sizeof(int),
+            STARPU_VALUE, &N,     sizeof(int),
+            STARPU_VALUE, &K,     sizeof(int),
+            STARPU_VALUE, &L,     sizeof(int),
+            STARPU_VALUE, &ib,    sizeof(int),
+            STARPU_R,      RTBLKADDR(V, MORSE_Complex64_t, Vm, Vn),
+            STARPU_VALUE, &ldv,   sizeof(int),
+            STARPU_R,      RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn),
+            STARPU_VALUE, &ldt,   sizeof(int),
+            STARPU_RW,     RTBLKADDR(A, MORSE_Complex64_t, Am, An),
+            STARPU_VALUE, &lda,   sizeof(int),
+            STARPU_RW,     RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn),
+            STARPU_VALUE, &ldb,   sizeof(int),
+            /* Other options */
+            STARPU_SCRATCH,   options->ws_worker,
+            STARPU_PRIORITY,  options->priority,
+            STARPU_CALLBACK,  callback,
+#if defined(CHAMELEON_USE_MPI)
+            STARPU_EXECUTE_ON_NODE, B->get_rankof(B, Bm, Bn),
+#endif
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+            STARPU_NAME, "ztpmqrt",
+#endif
+            0);
+    }
+}
+
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_ztpmqrt_cpu_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int M;
+    int N;
+    int K;
+    int L;
+    int ib;
+    const MORSE_Complex64_t *V;
+    int ldv;
+    const MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *WORK;
+
+    V    = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T    = (const MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    A    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+    B    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]);
+    WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[4]); /* ib * nb */
+
+    starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
+                                &ldv, &ldt, &lda, &ldb );
+
+    CORE_ztpmqrt( side, trans, M, N, K, L, ib,
+                  V, ldv, T, ldt, A, lda, B, ldb, WORK );
+}
+
+
+#if defined(CHAMELEON_USE_CUDA)
+static void cl_ztpmqrt_cuda_func(void *descr[], void *cl_arg)
+{
+    MORSE_enum side;
+    MORSE_enum trans;
+    int M;
+    int N;
+    int K;
+    int L;
+    int k;
+    int ib;
+    const cuDoubleComplex *V;
+    int ldv;
+    const cuDoubleComplex *T;
+    int ldt;
+    cuDoubleComplex *A;
+    int lda;
+    cuDoubleComplex *B;
+    int ldb;
+    cuDoubleComplex *W;
+    CUstream stream;
+
+    V = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[0]);
+    T = (const cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[1]);
+    A = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[2]);
+    B = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[3]);
+    W = (cuDoubleComplex *)STARPU_MATRIX_GET_PTR(descr[4]); /* 2*ib*nb */
+
+    starpu_codelet_unpack_args( cl_arg, &side, &trans, &M, &N, &K, &L, &ib,
+                                &ldv, &ldt, &lda, &ldb );
+
+    stream = starpu_cuda_get_local_stream();
+    cublasSetKernelStream( stream );
+
+    CUDA_ztpmqrt(
+            side, trans, M, N, K, L, ib,
+            A, lda, B, ldb, V, ldv, T, ldt,
+            W, stream );
+
+#ifndef STARPU_CUDA_ASYNC
+    cudaStreamSynchronize( stream );
+#endif
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+
+/*
+ * Codelet definition
+ */
+CODELETS(ztpmqrt, 5, cl_ztpmqrt_cpu_func, cl_ztpmqrt_cuda_func, STARPU_CUDA_ASYNC)
diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c
new file mode 100644
index 000000000..b6da13320
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_ztpqrt.c
@@ -0,0 +1,115 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file codelet_ztpqrt.c
+ *
+ *  MORSE codelets kernel
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "runtime/starpu/include/morse_starpu.h"
+#include "runtime/starpu/include/runtime_codelet_z.h"
+
+/**
+ * Submit a ztpqrt task to StarPU.
+ *
+ * The task is inserted only if at least one of the A, B or T tiles is local
+ * according to morse_desc_islocal().  The scalar arguments are packed as
+ * STARPU_VALUE in exactly the order in which cl_ztpqrt_cpu_func() unpacks
+ * them: M, N, L, ib, lda, ldb, ldt.
+ *
+ * A, B and T are accessed read-write, and options->ws_worker is passed as
+ * the scratch workspace.  With CHAMELEON_USE_MPI the task is executed on the
+ * node that owns the tile B(Bm, Bn).
+ *
+ * nb is not used by this implementation; it is part of the signature shared
+ * with the QUARK runtime, which uses it to size its buffers.
+ */
+void MORSE_TASK_ztpqrt( const MORSE_option_t *options,
+                        int M, int N, int L, int ib, int nb,
+                        const MORSE_desc_t *A, int Am, int An, int lda,
+                        const MORSE_desc_t *B, int Bm, int Bn, int ldb,
+                        const MORSE_desc_t *T, int Tm, int Tn, int ldt )
+{
+    struct starpu_codelet *codelet = &cl_ztpqrt;
+    void (*callback)(void*) = options->profiling ? cl_ztpqrt_callback : NULL;
+
+    if ( morse_desc_islocal( A, Am, An ) ||
+         morse_desc_islocal( B, Bm, Bn ) ||
+         morse_desc_islocal( T, Tm, Tn ) )
+    {
+        starpu_insert_task(
+            codelet,
+            STARPU_VALUE, &M,     sizeof(int),
+            STARPU_VALUE, &N,     sizeof(int),
+            STARPU_VALUE, &L,     sizeof(int),
+            STARPU_VALUE, &ib,    sizeof(int),
+            STARPU_RW,     RTBLKADDR(A, MORSE_Complex64_t, Am, An),
+            STARPU_VALUE, &lda,   sizeof(int),
+            STARPU_RW,     RTBLKADDR(B, MORSE_Complex64_t, Bm, Bn),
+            STARPU_VALUE, &ldb,   sizeof(int),
+            STARPU_RW,     RTBLKADDR(T, MORSE_Complex64_t, Tm, Tn),
+            STARPU_VALUE, &ldt,   sizeof(int),
+            /* Other options */
+            STARPU_SCRATCH,   options->ws_worker,
+            STARPU_PRIORITY,  options->priority,
+            STARPU_CALLBACK,  callback,
+#if defined(CHAMELEON_USE_MPI)
+            STARPU_EXECUTE_ON_NODE, B->get_rankof(B, Bm, Bn),
+#endif
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+            STARPU_NAME, "ztpqrt",
+#endif
+            0);
+    }
+}
+
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_ztpqrt_cpu_func(void *descr[], void *cl_arg)
+{
+    int M;
+    int N;
+    int L;
+    int ib;
+    MORSE_Complex64_t *A;
+    int lda;
+    MORSE_Complex64_t *B;
+    int ldb;
+    MORSE_Complex64_t *T;
+    int ldt;
+    MORSE_Complex64_t *WORK;
+
+    A    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
+    T    = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[2]);
+    WORK = (MORSE_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[3]); /* ib * nb */
+
+    starpu_codelet_unpack_args( cl_arg, &M, &N, &L, &ib,
+                                &lda, &ldb, &ldt );
+
+    CORE_ztpqrt( M, N, L, ib,
+                 A, lda, B, ldb, T, ldt, WORK );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(ztpqrt, 4, cl_ztpqrt_cpu_func)
diff --git a/runtime/starpu/include/runtime_codelet_profile.h b/runtime/starpu/include/runtime_codelet_profile.h
index 99303041a..67942fc01 100644
--- a/runtime/starpu/include/runtime_codelet_profile.h
+++ b/runtime/starpu/include/runtime_codelet_profile.h
@@ -119,6 +119,6 @@
     extern struct starpu_perfmodel cl_##name##_fake;    \
     void cl_##name##_callback();                        \
     void profiling_display_##name##_info(void);         \
-    void estimate_##name##_sustained_peak(double *res);
+    void estimate_##name##_sustained_peak(double *res)
 
 #endif /* __CODELET_PROFILE_H__ */
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 0da29addb..16de7ea01 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -73,6 +73,8 @@ ZCODELETS_HEADER(syssq)
 ZCODELETS_HEADER(trasm)
 ZCODELETS_HEADER(trssq)
 ZCODELETS_HEADER(trtri)
+ZCODELETS_HEADER(tpqrt)
+ZCODELETS_HEADER(tpmqrt)
 ZCODELETS_HEADER(tslqt)
 ZCODELETS_HEADER(tsmlq)
 ZCODELETS_HEADER(tsmqr)
diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h
index cf0a3bb31..14b1c8e56 100644
--- a/runtime/starpu/include/runtime_codelets.h
+++ b/runtime/starpu/include/runtime_codelets.h
@@ -87,7 +87,7 @@
 
 
 #define CODELETS_ALL_HEADER(name)                                             \
-     CHAMELEON_CL_CB_HEADER(name)                                             \
+     CHAMELEON_CL_CB_HEADER(name);                                            \
      void cl_##name##_load_fake_model(void);                                  \
      void cl_##name##_restore_model(void);                                    \
      extern struct starpu_codelet cl_##name;                                  \
diff --git a/runtime/starpu/include/runtime_workspace.h b/runtime/starpu/include/runtime_workspace.h
index e1bd1859d..a7d25d38e 100644
--- a/runtime/starpu/include/runtime_workspace.h
+++ b/runtime/starpu/include/runtime_workspace.h
@@ -26,10 +26,10 @@
 #ifndef _MORSE_STARPU_WORKSPACE_H_
 #define _MORSE_STARPU_WORKSPACE_H_
 
-/* 
- * Allocate workspace in host memory: CPU for any worker 
+/*
+ * Allocate workspace in host memory: CPU for any worker
  * or allocate workspace in worker's memory: main memory for cpu workers,
- * and embedded memory for CUDA devices. 
+ * and embedded memory for CUDA devices.
  */
 #define MORSE_HOST_MEM    0
 #define MORSE_WORKER_MEM  1
@@ -48,7 +48,7 @@ typedef struct morse_starpu_ws_s MORSE_starpu_ws_t;
  * (eg. MORSE_CUDA|MORSE_CPU for all CPU and GPU workers).  The
  * memory_location argument indicates whether this should be a buffer in host
  * memory or in worker's memory (MORSE_HOST_MEM or MORSE_WORKER_MEM). This function
- * returns 0 upon successful completion. 
+ * returns 0 upon successful completion.
  */
 int   RUNTIME_starpu_ws_alloc   ( MORSE_starpu_ws_t **workspace, size_t size, int which_workers, int memory_location);
 int   RUNTIME_starpu_ws_free    ( MORSE_starpu_ws_t  *workspace);
-- 
GitLab


From dfc3fae8bfee9c6810095f89f6c3237fbf177174 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 21 Dec 2016 00:18:46 +0100
Subject: [PATCH 6/8] Cleanup and silent warnings

---
 compute/pztpqrt.c                         | 19 -------------------
 compute/ztpqrt.c                          |  7 +++++--
 coreblas/compute/core_ztpmqrt.c           |  6 +++---
 runtime/starpu/codelets/codelet_ztpmqrt.c |  2 ++
 runtime/starpu/codelets/codelet_ztpqrt.c  |  2 ++
 5 files changed, 12 insertions(+), 24 deletions(-)

diff --git a/compute/pztpqrt.c b/compute/pztpqrt.c
index 8dd8c6335..0b825d5c2 100644
--- a/compute/pztpqrt.c
+++ b/compute/pztpqrt.c
@@ -27,11 +27,6 @@
 #define A(m,n) A,  m,  n
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization - dynamic scheduling
@@ -43,7 +38,6 @@ void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldbm;
@@ -101,12 +95,6 @@ void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tsqrt and unmqr tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb, A->p, A->q);
-#endif
-
     for (k = 0; k < A->nt; k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -141,11 +129,4 @@ void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/ztpqrt.c b/compute/ztpqrt.c
index ef9e4232e..6efe33bbb 100644
--- a/compute/ztpqrt.c
+++ b/compute/ztpqrt.c
@@ -215,7 +215,8 @@ int MORSE_ztpqrt( int M, int N, int L,
     return status;
 }
 
-/***************************************************************************//**
+/**
+ *******************************************************************************
  *
  * @ingroup MORSE_Complex64_t_Tile
  *
@@ -268,6 +269,7 @@ int MORSE_ztpqrt_Tile( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T
     morse_sequence_create(morse, &sequence);
     MORSE_ztpqrt_Tile_Async(L, A, B, T, sequence, &request);
     morse_sequence_wait(morse, sequence);
+    RUNTIME_desc_getoncpu(A);
     RUNTIME_desc_getoncpu(B);
 
     status = sequence->status;
@@ -275,7 +277,8 @@ int MORSE_ztpqrt_Tile( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T
     return status;
 }
 
-/***************************************************************************//**
+/**
+ *******************************************************************************
  *
  * @ingroup MORSE_Complex64_t_Tile_Async
  *
diff --git a/coreblas/compute/core_ztpmqrt.c b/coreblas/compute/core_ztpmqrt.c
index 2241b5d39..b9addb167 100644
--- a/coreblas/compute/core_ztpmqrt.c
+++ b/coreblas/compute/core_ztpmqrt.c
@@ -46,7 +46,7 @@
  * @param[in] M
  *         The number of rows of the tile B. M >= 0.
  *
- * @param[in] N1
+ * @param[in] N
  *         The number of columns of the tile B. N >= 0.
  *
  * @param[in] K
@@ -63,7 +63,7 @@
  * @param[in] V
  *         The i-th row must contain the vector which defines the
  *         elementary reflector H(i), for i = 1,2,...,k, as returned by
- *         CORE_ZTTQRT in the first k rows of its array argument V.
+ *         CORE_ZTPQRT in the first k rows of its array argument V.
  *
  * @param[in] LDV
  *         The leading dimension of the array V. LDV >= max(1,K).
@@ -84,7 +84,7 @@
  *         Q*C or Q**H*C or C*Q or C*Q**H.  See Further Details.
  *
  * @param[in] LDA
- *         The leading dimension of the array A. LDA1 >= max(1,M1).
+ *         The leading dimension of the array A. LDA >= max(1,M).
  *         If side = MorseLeft,  LDA >= max(1,K);
  *         If side = Morseright, LDA >= max(1,M).
  *
diff --git a/runtime/starpu/codelets/codelet_ztpmqrt.c b/runtime/starpu/codelets/codelet_ztpmqrt.c
index 98188588e..7afc9e1bf 100644
--- a/runtime/starpu/codelets/codelet_ztpmqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpmqrt.c
@@ -69,6 +69,8 @@ void MORSE_TASK_ztpmqrt( const MORSE_option_t *options,
 #endif
             0);
     }
+
+    (void)ib; (void)nb;
 }
 
 
diff --git a/runtime/starpu/codelets/codelet_ztpqrt.c b/runtime/starpu/codelets/codelet_ztpqrt.c
index b6da13320..06ee745ed 100644
--- a/runtime/starpu/codelets/codelet_ztpqrt.c
+++ b/runtime/starpu/codelets/codelet_ztpqrt.c
@@ -61,6 +61,8 @@ void MORSE_TASK_ztpqrt( const MORSE_option_t *options,
 #endif
             0);
     }
+
+    (void)ib; (void)nb;
 }
 
 
-- 
GitLab


From 0dd5b28803c5aae948aa81ca3586e953379587c4 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 21 Dec 2016 00:20:41 +0100
Subject: [PATCH 7/8] Add ztpgqrt function to generate Q from ztpqrt

---
 compute/CMakeLists.txt |   2 +
 compute/pztpgqrt.c     | 106 ++++++++++++
 compute/ztpgqrt.c      | 371 +++++++++++++++++++++++++++++++++++++++++
 control/compute_z.h    |   1 +
 include/morse_z.h      |   3 +
 5 files changed, 483 insertions(+)
 create mode 100644 compute/pztpgqrt.c
 create mode 100644 compute/ztpgqrt.c

diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 8d61cde93..9dcc70224 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -130,6 +130,7 @@ set(ZSRC
     pzunmlqrh.c
     pzunmqr.c
     pzunmqrrh.c
+    pztpgqrt.c
     pztpqrt.c
     ###
     zgels.c
@@ -168,6 +169,7 @@ set(ZSRC
     zungqr.c
     zunmlq.c
     zunmqr.c
+    ztpgqrt.c
     ztpqrt.c
     ##################
     # MIXED PRECISION
diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c
new file mode 100644
index 000000000..723dbf369
--- /dev/null
+++ b/compute/pztpgqrt.c
@@ -0,0 +1,106 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file pztpgqrt.c
+ *
+ *  MORSE computational routines
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+#define V(m,n) V,  m,  n
+#define T(m,n) T,  m,  n
+#define A(m,n) A,  m,  n
+#define B(m,n) B,  m,  n
+
+/***************************************************************************//**
+ *  Parallel construction of Q from a tile TPQRT factorization - dynamic scheduling
+ **/
+void morse_pztpgqrt( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B,
+                     MORSE_sequence_t *sequence, MORSE_request_t *request )
+{
+    MORSE_context_t *morse;
+    MORSE_option_t options;
+    size_t ws_worker = 0;
+    size_t ws_host = 0;
+
+    int k, m, n;
+    int ldak, ldvm, ldbm;
+    int tempkn, tempnn, tempmm, templm;
+    int ib;
+
+    /* Dimension of the first column */
+    int maxm  = B->m - L;
+    int maxmt = (maxm % B->mb == 0) ? (maxm / B->mb) : (maxm / B->mb + 1);
+    int maxmtk;
+
+    morse = morse_context_self();
+    if (sequence->status != MORSE_SUCCESS)
+        return;
+    RUNTIME_options_init(&options, morse, sequence, request);
+
+    ib = MORSE_IB;
+
+    /*
+     * ztpmqrt = A->nb * ib
+     */
+    ws_worker = A->nb * ib;
+
+    /* Allocation of temporary (scratch) working space */
+#if defined(CHAMELEON_USE_CUDA)
+    /* Worker space
+     *
+     * ztpmqrt = 2 * A->nb * ib
+     */
+    ws_worker = max( ws_worker, ib * A->nb * 2 );
+#endif
+
+    ws_worker *= sizeof(MORSE_Complex64_t);
+    ws_host   *= sizeof(MORSE_Complex64_t);
+
+    RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
+
+    for (k = V->nt-1; k >= 0; k--) {
+        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+        ldak = BLKLDD(A, k);
+
+        maxmtk = min( B->mt, maxmt+k ) - 1;
+        for (m = maxmtk; m > -1; m--) {
+            tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
+            templm = m == maxmtk  ? tempmm       : 0;
+            ldvm = BLKLDD(V, m);
+            ldbm = BLKLDD(B, m);
+
+            for (n = k; n < B->nt; n++) {
+                tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
+                MORSE_TASK_ztpmqrt(
+                    &options,
+                    MorseLeft, MorseConjTrans,
+                    tempmm, tempnn, tempkn, templm, ib, T->nb,
+                    V(m, k), ldvm,
+                    T(m, k), T->mb,
+                    A(k, n), ldak,
+                    B(m, n), ldbm );
+            }
+        }
+    }
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, morse);
+    MORSE_TASK_dataflush_all();
+}
diff --git a/compute/ztpgqrt.c b/compute/ztpgqrt.c
new file mode 100644
index 000000000..6a4ef6d7c
--- /dev/null
+++ b/compute/ztpgqrt.c
@@ -0,0 +1,371 @@
+/**
+ *
+ * @copyright (c) 2009-2016 The University of Tennessee and The University
+ *                          of Tennessee Research Foundation.
+ *                          All rights reserved.
+ * @copyright (c) 2012-2016 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                          Univ. Bordeaux. All rights reserved.
+ *
+ **/
+
+/**
+ *
+ * @file ztpgqrt.c
+ *
+ *  MORSE computational routines
+ *  MORSE is a software package provided by Univ. of Tennessee,
+ *  Univ. of California Berkeley and Univ. of Colorado Denver
+ *
+ * @version 0.9.0
+ * @author Mathieu Faverge
+ * @date 2016-12-15
+ * @precisions normal z -> s d c
+ *
+ **/
+#include "control/common.h"
+
+/**
+ ******************************************************************************
+ *
+ * @ingroup MORSE_Complex64_t
+ *
+ *  MORSE_ztpgqrt - Generates a partial Q matrix formed with a blocked QR
+ *  factorization of a "triangular-pentagonal" matrix C, which is composed of an
+ *  unused triangular block and a pentagonal block V, using the compact
+ *  representation for Q. See MORSE_ztpqrt() to generate V.
+ *
+ *******************************************************************************
+ *
+ * @param[in] M
+ *          The number of rows of the matrices B, and V. M >= 0.
+ *
+ * @param[in] N
+ *          The number of columns of the matrices B, and A. N >= 0.
+ *
+ * @param[in] K
+ *          The number of elementary reflectors whose product defines
+ *          the matrix Q in the matrix V.
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ *          MIN(M,K) >= L >= 0.  See Further Details.
+ *
+ * @param[in] V
+ *          The i-th row must contain the vector which defines the
+ *          elementary reflector H(i), for i = 1,2,...,k, as returned by
+ *          MORSE_ztpqrt() in the first k rows of its array argument V.
+ *          V is a matrix of size M-by-K. The first M-L rows
+ *          are rectangular, and the last L rows are upper trapezoidal.
+ *
+ * @param[in] LDV
+ *          The leading dimension of the array V. LDV >= max(1,M).
+ *
+ * @param[in] descT
+ *          Auxiliary factorization data, as produced by MORSE_ztpqrt() together
+ *          with V, and required here to apply Q.
+ *
+ * @param[in,out] A
+ *          A is COMPLEX*16 array, dimension (LDA,N)
+ *          On entry, the K-by-N matrix A.
+ *          On exit, A is overwritten by the corresponding block of
+ *          Q*A.  See Further Details.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,K).
+ *
+ * @param[in,out] B
+ *          On entry, the pentagonal M-by-N matrix B.
+ *          On exit, B contains Q.
+ *
+ * @param[in] LDB
+ *          The leading dimension of the array B.  LDB >= max(1,M).
+ *
+ * @par Further Details:
+ * =====================
+ *
+ *  The input matrix Q is a (K+M)-by-N matrix
+ *
+ *               Q = [ A ]
+ *                   [ B ]
+ *
+ *  where A is an identity matrix, and B is a M-by-N matrix of 0.
+ *  V is a matrix of Householder reflectors with a pentagonal shape consisting of a
+ *  (M-L)-by-K rectangular matrix V1 on top of a L-by-K
+ *  upper trapezoidal matrix V2:
+ *
+ *               V = [ V1 ]  <- (M-L)-by-K rectangular
+ *                   [ V2 ]  <-     L-by-K upper trapezoidal.
+ *
+ *  The upper trapezoidal matrix V2 consists of the first L rows of a
+ *  K-by-K upper triangular matrix, where 0 <= L <= MIN(M,K).  If L=0,
+ *  V is rectangular M-by-K; if M=L=K, V is upper triangular.
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *          \retval <0 if -i, the i-th argument had an illegal value
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpgqrt_Tile
+ * @sa MORSE_ztpgqrt_Tile_Async
+ * @sa MORSE_ctpgqrt
+ * @sa MORSE_dtpgqrt
+ * @sa MORSE_stpgqrt
+ * @sa MORSE_zgeqrs
+ *
+ ******************************************************************************/
+int MORSE_ztpgqrt( int M, int N, int K, int L,
+                   MORSE_Complex64_t *V, int LDV,
+                   MORSE_desc_t *descT,
+                   MORSE_Complex64_t *A, int LDA,
+                   MORSE_Complex64_t *B, int LDB )
+{
+    int NB;
+    int status;
+    MORSE_context_t *morse;
+    MORSE_sequence_t *sequence = NULL;
+    MORSE_request_t request = MORSE_REQUEST_INITIALIZER;
+    MORSE_desc_t descA, descB, descV;
+    int minMK = min( M, K );
+
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_fatal_error("MORSE_ztpgqrt", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+
+    /* Check input arguments: -i is returned for the i-th argument of the signature */
+    if (M < 0) {
+        morse_error("MORSE_ztpgqrt", "illegal value of M");
+        return -1;
+    }
+    if (N < 0) {
+        morse_error("MORSE_ztpgqrt", "illegal value of N");
+        return -2;
+    }
+    if (K < 0) {
+        morse_error("MORSE_ztpgqrt", "illegal value of K");
+        return -3;
+    }
+    if ((L < 0) || ((L > minMK) && (minMK > 0))) {
+        morse_error("MORSE_ztpgqrt", "illegal value of L");
+        return -4;
+    }
+    if (LDV < max(1, M)) {
+        morse_error("MORSE_ztpgqrt", "illegal value of LDV");
+        return -6;
+    }
+    if (LDA < max(1, K)) {
+        morse_error("MORSE_ztpgqrt", "illegal value of LDA");
+        return -9;
+    }
+    if (LDB < max(1, M)) {
+        morse_error("MORSE_ztpgqrt", "illegal value of LDB");
+        return -11;
+    }
+
+    /* Quick return: nothing to do when M or K is zero */
+    if (minMK == 0)
+        return MORSE_SUCCESS;
+
+    /* Tune NB & IB depending on M and K */
+    status = morse_tune(MORSE_FUNC_ZGELS, M, K, 0);
+    if (status != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpgqrt", "morse_tune() failed");
+        return status;
+    }
+
+    /* Tile size used for every descriptor created below */
+    NB = MORSE_NB;
+
+    morse_sequence_create(morse, &sequence);
+
+    /*
+     * Build the tile descriptors from the LAPACK-layout arrays.
+     * V is M-by-K and is stored with leading dimension LDV (LDB only applies
+     * to B).
+     */
+    morse_zooplap2tile( descV, V, NB, NB, LDV, K, 0, 0, M, K, sequence, &request,
+                        morse_desc_mat_free(&(descV)) );
+    morse_zooplap2tile( descA, A, NB, NB, LDA, N, 0, 0, K, N, sequence, &request,
+                        (morse_desc_mat_free(&(descV)),
+                         morse_desc_mat_free(&(descA))) );
+    morse_zooplap2tile( descB, B, NB, NB, LDB, N, 0, 0, M, N, sequence, &request,
+                        (morse_desc_mat_free(&(descV)),
+                         morse_desc_mat_free(&(descA)),
+                         morse_desc_mat_free(&(descB))) );
+
+    /* Call the tile interface */
+    MORSE_ztpgqrt_Tile_Async(L, &descV, descT, &descA, &descB, sequence, &request);
+
+    /*
+     * Only A and B are copied back to the caller's arrays: V is not converted
+     * back, it is only released.
+     */
+    morse_zooptile2lap(descA, A, NB, NB, LDA, N, sequence, &request);
+    morse_zooptile2lap(descB, B, NB, NB, LDB, N, sequence, &request);
+
+    /* Wait for every submitted operation before releasing the tile storage */
+    morse_sequence_wait(morse, sequence);
+    morse_desc_mat_free(&descV);
+    morse_desc_mat_free(&descA);
+    morse_desc_mat_free(&descB);
+
+    /* Read the status before the sequence is destroyed */
+    status = sequence->status;
+    morse_sequence_destroy(morse, sequence);
+    return status;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup MORSE_Complex64_t_Tile
+ *
+ *  MORSE_ztpgqrt_Tile - Generates a partial Q matrix formed with a blocked QR
+ *  factorization of a "triangular-pentagonal" matrix C, which is composed of an
+ *  unused triangular block and a pentagonal block V, using the compact
+ *  representation for Q. See MORSE_ztpqrt() to generate V.
+ *
+ *******************************************************************************
+ *
+ * @param[in] L
+ *          The number of rows of the upper trapezoidal part of V.
+ * @param[in] V
+ *          Descriptor of the pentagonal reflector matrix V; see MORSE_ztpgqrt().
+ * @param[in] T
+ *          Descriptor of the auxiliary factorization data that goes with V.
+ * @param[in,out] A
+ *          Descriptor of the matrix A; see MORSE_ztpgqrt().
+ * @param[in,out] B
+ *          Descriptor of the matrix B; see MORSE_ztpgqrt().
+ *
+ *******************************************************************************
+ *
+ * @return
+ *          \retval MORSE_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpgqrt
+ * @sa MORSE_ztpgqrt_Tile_Async
+ * @sa MORSE_ctpgqrt_Tile
+ * @sa MORSE_dtpgqrt_Tile
+ * @sa MORSE_stpgqrt_Tile
+ * @sa MORSE_zgeqrs_Tile
+ *
+ ******************************************************************************/
+int MORSE_ztpgqrt_Tile( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B )
+{
+    MORSE_request_t   request  = MORSE_REQUEST_INITIALIZER;
+    MORSE_sequence_t *sequence = NULL;
+    MORSE_context_t  *morse    = morse_context_self();
+    int               rc;
+
+    if (morse == NULL) {
+        morse_fatal_error("MORSE_ztpgqrt_Tile", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+    /* Submit through the asynchronous interface, then wait for completion */
+    morse_sequence_create(morse, &sequence);
+    MORSE_ztpgqrt_Tile_Async(L, V, T, A, B, sequence, &request);
+    morse_sequence_wait(morse, sequence);
+    RUNTIME_desc_getoncpu(A);
+    RUNTIME_desc_getoncpu(B);
+
+    rc = sequence->status;  /* read before the sequence is destroyed */
+    morse_sequence_destroy(morse, sequence);
+    return rc;
+}
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup MORSE_Complex64_t_Tile_Async
+ *
+ *  MORSE_ztpgqrt_Tile_Async - Generates a partial Q matrix formed with a blocked QR
+ *  factorization of a "triangular-pentagonal" matrix C, which is composed of an
+ *  unused triangular block and a pentagonal block V, using the compact
+ *  representation for Q. See MORSE_ztpqrt() to generate V.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa MORSE_ztpgqrt
+ * @sa MORSE_ztpgqrt_Tile
+ * @sa MORSE_ctpgqrt_Tile_Async
+ * @sa MORSE_dtpgqrt_Tile_Async
+ * @sa MORSE_stpgqrt_Tile_Async
+ * @sa MORSE_zgeqrs_Tile_Async
+ *
+ ******************************************************************************/
+int MORSE_ztpgqrt_Tile_Async( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B,
+                              MORSE_sequence_t *sequence, MORSE_request_t *request )
+{
+    MORSE_context_t *morse;
+
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "MORSE not initialized");
+        return MORSE_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        morse_fatal_error("MORSE_ztpgqrt_Tile_Async", "NULL sequence");
+        return MORSE_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        morse_fatal_error("MORSE_ztpgqrt_Tile_Async", "NULL request");
+        return MORSE_ERR_UNALLOCATED;
+    }
+    /* Check sequence status: a sequence that already failed takes no new work */
+    if (sequence->status == MORSE_SUCCESS)
+        request->status = MORSE_SUCCESS;
+    else
+        return morse_request_fail(sequence, request, MORSE_ERR_SEQUENCE_FLUSHED);
+
+    /* Check descriptors for correctness, in signature order: V, T, A, B */
+    if (morse_desc_check(V) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "invalid first descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (morse_desc_check(T) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "invalid second descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (morse_desc_check(A) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "invalid third descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (morse_desc_check(B) != MORSE_SUCCESS) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "invalid fourth descriptor");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments */
+    if (A->nb != A->mb) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "only square tiles supported");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+    if (((B->m - L) % B->mb) != 0) {
+        morse_error("MORSE_ztpgqrt_Tile_Async", "Triangular part must be aligned with tiles");
+        return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
+    }
+
+    /*
+     * Hand over to the parallel routine that submits the ztpmqrt tile tasks.
+     * It returns void, so MORSE_SUCCESS below only means the submission was
+     * made; the outcome is read from sequence->status by the callers.
+     */
+    morse_pztpgqrt(L, V, T, A, B, sequence, request);
+
+    return MORSE_SUCCESS;
+}
diff --git a/control/compute_z.h b/control/compute_z.h
index d99406b14..122120e48 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -134,6 +134,7 @@ void morse_pzsyrk(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MO
 void morse_pzsyr2k(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzsytrf(MORSE_enum uplo, MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztile2band(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *descAB, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pztpgqrt( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request );
 void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request );
 void morse_pztradd(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
diff --git a/include/morse_z.h b/include/morse_z.h
index 07c81112a..2718a6bea 100644
--- a/include/morse_z.h
+++ b/include/morse_z.h
@@ -102,6 +102,7 @@ int MORSE_zsyrk(MORSE_enum uplo, MORSE_enum trans, int N, int K, MORSE_Complex64
 int MORSE_zsyr2k(MORSE_enum uplo, MORSE_enum trans, int N, int K, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, MORSE_Complex64_t beta, MORSE_Complex64_t *C, int LDC);
 int MORSE_zsysv(MORSE_enum uplo, int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
 int MORSE_zsytrs(MORSE_enum uplo, int N, int NRHS, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
+int MORSE_ztpgqrt( int M, int N, int K, int L, MORSE_Complex64_t *V, int LDV, MORSE_desc_t *descT, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB );
 int MORSE_ztpqrt( int M, int N, int L, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB, MORSE_desc_t *descT );
 int MORSE_ztradd(MORSE_enum uplo, MORSE_enum trans, int M, int N, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t beta, MORSE_Complex64_t *B, int LDB);
 int MORSE_ztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, int N, int NRHS, MORSE_Complex64_t alpha, MORSE_Complex64_t *A, int LDA, MORSE_Complex64_t *B, int LDB);
@@ -180,6 +181,7 @@ int MORSE_zsyrk_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha,
 int MORSE_zsyr2k_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C);
 int MORSE_zsysv_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
 int MORSE_zsytrs_Tile(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B);
+int MORSE_ztpgqrt_Tile( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B );
 int MORSE_ztpqrt_Tile( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T );
 int MORSE_ztradd_Tile(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B);
 int MORSE_ztrmm_Tile(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B);
@@ -255,6 +257,7 @@ int MORSE_zsytrs_Tile_Async(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, M
 int MORSE_zsymm_Tile_Async(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_zsyrk_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_zsyr2k_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
+int MORSE_ztpgqrt_Tile_Async( int L, MORSE_desc_t *V, MORSE_desc_t *T, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request );
 int MORSE_ztpqrt_Tile_Async( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request );
 int MORSE_ztradd_Tile_Async(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 int MORSE_ztrmm_Tile_Async(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
-- 
GitLab


From 1d9d4d50e74891f32acf7fe3341160e7ff282f0e Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 21 Dec 2016 00:39:32 +0100
Subject: [PATCH 8/8] Update morce_cmake

---
 cmake_modules/morse_cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/cmake_modules/morse_cmake b/cmake_modules/morse_cmake
index bf123957d..ace7c7b0c 160000
--- a/cmake_modules/morse_cmake
+++ b/cmake_modules/morse_cmake
@@ -1 +1 @@
-Subproject commit bf123957de0f13775792b9ee8f788c02ee87ca55
+Subproject commit ace7c7b0ce506774db62a6fc5a3e178c8822bf91
-- 
GitLab