From 250e7c55dec577869a33c57dece49be722b752d1 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 8 Oct 2019 10:30:11 +0200
Subject: [PATCH] Hotfix - QR/LQ

---
 compute/pzgelqf.c                           |   6 +-
 compute/pzgelqf_param.c                     |   7 +-
 compute/pzgelqfrh.c                         |   7 +-
 compute/pzgeqrf.c                           |   6 +-
 compute/pzgeqrf_param.c                     |   7 +-
 compute/pzgeqrfrh.c                         |   7 +-
 compute/pzgetrf_incpiv.c                    |  11 +-
 compute/pzhetrd_he2hb.c                     |  51 ++--
 compute/pzunglq.c                           |   8 +-
 compute/pzunglq_param.c                     |   9 +-
 compute/pzunglqrh.c                         |   6 +-
 compute/pzungqr.c                           |   6 +-
 compute/pzungqr_param.c                     |   5 +-
 compute/pzungqrrh.c                         |   5 +-
 compute/pzunmlq.c                           |  22 +-
 compute/pzunmlq_param.c                     |  20 +-
 compute/pzunmlqrh.c                         |  23 +-
 compute/pzunmqr.c                           |  27 ++-
 compute/pzunmqr_param.c                     |  28 ++-
 compute/pzunmqrrh.c                         |  24 +-
 compute/zgels.c                             | 253 +++++++++++++-------
 compute/zgels_param.c                       | 226 ++++++++++-------
 compute/zunmlq.c                            |   1 -
 compute/zunmqr.c                            |   1 -
 runtime/starpu/control/runtime_descriptor.c |   6 +-
 25 files changed, 492 insertions(+), 280 deletions(-)

diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 64a962bba..a85cad0fd 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -100,15 +100,17 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
             A(k, k), ldak,
             T(k, k), T->mb);
         if ( genD ) {
+            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+            int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamUpper, tempkm, tempkn, A->nb,
+                ChamUpper, tempDkm, tempDkn, A->nb,
                 A(k, k), ldak,
                 D(k), lddk );
 #if defined(CHAMELEON_USE_CUDA)
             INSERT_TASK_zlaset(
                 &options,
-                ChamLower, tempkm, tempkn,
+                ChamLower, tempDkm, tempDkn,
                 0., 1.,
                 D(k), lddk );
 #endif
diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c
index a5284b7c9..511125da4 100644
--- a/compute/pzgelqf_param.c
+++ b/compute/pzgelqf_param.c
@@ -108,15 +108,18 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
                 T(k, p), T->mb);
 
             if ( genD ) {
+                int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+                int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
+
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkm, temppn, A->nb,
+                    ChamUpper, tempDkm, tempDpn, A->nb,
                     A(k, p), ldak,
                     D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamLower, tempkm, temppn,
+                    ChamLower, tempDkm, tempDpn,
                     0., 1.,
                     D(k, p), lddk );
 #endif
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index e4c50ca6a..8eb69f56a 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -103,15 +103,18 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
                 A(k, N), ldak,
                 T(k, N), T->mb);
             if ( genD ) {
+                int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+                int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
+
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkm, tempNn, A->nb,
+                    ChamUpper, tempDkm, tempDNn, A->nb,
                     A(k, N), ldak,
                     D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamLower, tempkm, tempNn,
+                    ChamLower, tempDkm, tempDNn,
                     0., 1.,
                     D(k, N), lddk );
 #endif
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 6d6d3c60b..3f92fa392 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -95,15 +95,17 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
             A(k, k), ldak,
             T(k, k), T->mb);
         if ( genD ) {
+            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+            int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamLower, tempkm, tempkn, A->nb,
+                ChamLower, tempDkm, tempDkn, A->nb,
                 A(k, k), ldak,
                 D(k), lddk );
 #if defined(CHAMELEON_USE_CUDA)
             INSERT_TASK_zlaset(
                 &options,
-                ChamUpper, tempkm, tempkn,
+                ChamUpper, tempDkm, tempDkn,
                 0., 1.,
                 D(k), lddk );
 #endif
diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c
index bb7955797..ae417f76e 100644
--- a/compute/pzgeqrf_param.c
+++ b/compute/pzgeqrf_param.c
@@ -110,15 +110,18 @@ void chameleon_pzgeqrf_param( int genD, int K,
                 T(m, k), T->mb);
 
             if ( genD ) {
+                int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb;
+                int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
+
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempmm, tempkn, A->nb,
+                    ChamLower, tempDmm, tempDkn, A->nb,
                     A(m, k), ldam,
                     D(m, k), lddm );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamUpper, tempmm, tempkn,
+                    ChamUpper, tempDmm, tempDkn,
                     0., 1.,
                     D(m, k), lddm );
 #endif
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index 7f235365d..1182e5bc1 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -101,15 +101,18 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
                 A(M, k), ldaM,
                 T(M, k), T->mb);
             if ( genD ) {
+                int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
+                int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
+
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempMm, tempkn, A->nb,
+                    ChamLower, tempDMm, tempDkn, A->nb,
                     A(M, k), ldaM,
                     D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamUpper, tempMm, tempkn,
+                    ChamUpper, tempDMm, tempDkn,
                     0., 1.,
                     D(M, k), lddM );
 #endif
diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c
index e4a494c68..642970b62 100644
--- a/compute/pzgetrf_incpiv.c
+++ b/compute/pzgetrf_incpiv.c
@@ -49,7 +49,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
     size_t ws_host = 0;
 
     int k, m, n;
-    int ldak, ldam;
+    int ldak, ldam, lddk;
     int tempkm, tempkn, tempmm, tempnn;
     int ib;
     int minMNT = chameleon_min(A->mt, A->nt);
@@ -62,6 +62,10 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
 
     ib = CHAMELEON_IB;
 
+    if ( D == NULL ) {
+        D = A;
+    }
+
     /*
      * zgetrf_incpiv = 0
      * zgessm        = 0
@@ -81,6 +85,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
         ldak = BLKLDD(A, k);
+        lddk = BLKLDD(D, k);
         INSERT_TASK_zgetrf_incpiv(
             &options,
             tempkm, tempkn, ib, L->nb,
@@ -95,7 +100,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
                 &options,
                 ChamUpperLower, tempkm, tempkn, A->nb,
                 A(k, k), ldak,
-                D(k), ldak);
+                D(k),    lddk);
 #endif
         }
 
@@ -106,7 +111,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
                 tempkm, tempnn, tempkm, ib, L->nb,
                 IPIV(k, k),
                 L(k, k), L->mb,
-                D(k), ldak,
+                D(k),    lddk,
                 A(k, n), ldak);
         }
         for (m = k+1; m < A->mt; m++) {
diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c
index 1ef361ea7..a5b1aeb3c 100644
--- a/compute/pzhetrd_he2hb.c
+++ b/compute/pzhetrd_he2hb.c
@@ -23,7 +23,7 @@
 
 #define A(m, n) A,  m,  n
 #define T(m, n) T,  m,  n
-#define D(k) &D, (k)-1, 0
+#define D(k)  &D, (k)-1, 0
 
 #define AT(k) &AT, k, 0
 
@@ -49,6 +49,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
 
     int k, m, n, i, j;
     int ldak, ldak1, ldam, ldan, ldaj, ldai;
+    int lddk, lddk1, lddm, lddn, ldek, ldek1;
     int tempkm, tempkn, tempmm, tempnn, tempjj;
     int ib;
 
@@ -98,12 +99,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
     for (k = 1; k < A->nt; k++){
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
         ldak = BLKLDD(A, k);
+        lddk = BLKLDD((&D), k);
 
         INSERT_TASK_zhe2ge(&options,
                           uplo,
                           tempkn, tempkn, ldak,
                           A(k, k), ldak,
-                          D(k),    ldak);
+                          D(k),    lddk);
     }
 
     if (uplo == ChamLower) {
@@ -113,6 +115,8 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
            tempkm = k+1 == A->mt-1 ? A->m-(k+1)*A->mb : A->mb;
            tempkn = k   == A->nt-1 ? A->n- k   *A->nb : A->nb;
            ldak1 = BLKLDD(A, k+1);
+           lddk1 = BLKLDD((&D), k+1);
+           ldek1 = BLKLDD(E, k+1);
 
            INSERT_TASK_zgeqrt(
                &options,
@@ -125,13 +129,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                &options,
                ChamLower, tempkm, tempkn, A->nb,
                A(k+1, k), ldak1,
-               E(k+1, k), ldak1 );
+               E(k+1, k), ldek1 );
 #if defined(CHAMELEON_USE_CUDA)
            INSERT_TASK_zlaset(
                &options,
                ChamUpper, tempkm, tempkn,
                0., 1.,
-               E(k+1, k), ldak1 );
+               E(k+1, k), ldek1 );
 #endif
 #endif
 
@@ -142,7 +146,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                tempkm, tempkm, ib, A->nb,
                E(k+1, k), ldak1,
                T(k+1, k), T->mb,
-               D(k+1),    ldak1);
+               D(k+1),    lddk1);
 
            /* RIGHT on the remaining tiles until the bottom */
            for (m = k+2; m < A->mt ; m++) {
@@ -152,7 +156,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamRight, ChamNoTrans,
                    tempmm, A->nb, tempkm, ib, A->nb,
-                   E(k+1, k),   ldak1,
+                   E(k+1, k),   ldek1,
                    T(k+1, k),   T->mb,
                    A(m,   k+1), ldam);
            }
@@ -160,6 +164,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
            for (m = k+2; m < A->mt; m++) {
                tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
                ldam = BLKLDD(A, m);
+               lddm = BLKLDD((&D), m);
 
                options.priority = 1;
                INSERT_TASK_ztsqrt(
@@ -222,7 +227,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamLeft, ChamConjTrans,
                    A->mb, A->nb, tempmm, A->nb, A->nb, ib, A->nb,
-                   D(k+1), ldak1,
+                   D(k+1),    lddk1,
                    A(m, k+1), ldam,
                    A(m,   k), ldam,
                    T(m,   k), T->mb);
@@ -233,8 +238,8 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamLeft, ChamConjTrans,
                    A->mb, tempmm, tempmm, tempmm, A->nb, ib, A->nb,
-                   AT(m), ldak1,
-                   D(m) , ldam,
+                   AT(m),    ldak1,
+                   D(m),     lddm,
                    A(m,  k), ldam,
                    T(m,  k), T->mb);
 
@@ -243,7 +248,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamRight, ChamNoTrans,
                    A->mb, A->nb, A->mb, tempmm, A->nb, ib, A->nb,
-                   D(k+1), ldak1,
+                   D(k+1), lddk1,
                    AT(m) , ldak1,
                    A(m,   k), ldam,
                    T(m,   k), T->mb);
@@ -254,7 +259,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    ChamRight, ChamNoTrans,
                    tempmm, A->nb, tempmm, tempmm, A->nb, ib, A->nb,
                    A(m, k+1), ldam,
-                   D(m)  , ldam,
+                   D(m),      lddm,
                    A(m,   k), ldam,
                    T(m,   k), T->mb);
                options.priority = 0;
@@ -270,7 +275,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
            tempkn = k+1 == A->nt-1 ? A->n-(k+1)*A->nb : A->nb;
            tempkm = k   == A->mt-1 ? A->m- k   *A->mb : A->mb;
            ldak  = BLKLDD(A, k);
+           ldek  = BLKLDD(E, k);
            ldak1 = BLKLDD(A, k+1);
+           lddk1 = BLKLDD((&D), k+1);
            INSERT_TASK_zgelqt(
                &options,
                tempkm, tempkn, ib, A->nb,
@@ -282,13 +289,13 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                &options,
                ChamUpper, tempkm, tempkn, A->nb,
                A(k, k+1), ldak,
-               E(k, k+1), ldak );
+               E(k, k+1), ldek );
 #if defined(CHAMELEON_USE_CUDA)
            INSERT_TASK_zlaset(
                &options,
                ChamLower, tempkm, tempkn,
                0., 1.,
-               E(k, k+1), ldak );
+               E(k, k+1), ldek );
 #endif
 #endif
 
@@ -297,9 +304,9 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                &options,
                ChamUpper,
                tempkn, tempkn, ib, A->nb,
-               E(k, k+1), ldak,
+               E(k, k+1), ldek,
                T(k, k+1), T->mb,
-               D(k+1),    ldak1);
+               D(k+1),    lddk1);
 
            /* LEFT on the remaining tiles until the left side */
            for (n = k+2; n < A->nt ; n++) {
@@ -308,7 +315,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamLeft, ChamNoTrans,
                    A->mb, tempnn, tempkn, ib, A->nb,
-                   E(k,   k+1), ldak,
+                   E(k,   k+1), ldek,
                    T(k,   k+1), T->mb,
                    A(k+1, n  ), ldak1);
            }
@@ -316,6 +323,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
            for (n = k+2; n < A->nt; n++) {
                tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
                ldan = BLKLDD(A, n);
+               lddn = BLKLDD((&D), n);
                options.priority = 1;
                INSERT_TASK_ztslqt(
                    &options,
@@ -375,7 +383,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamRight, ChamConjTrans,
                    A->mb, A->nb, A->mb, tempnn, A->nb, ib, A->nb,
-                   D(k+1),    ldak1,
+                   D(k+1),    lddk1,
                    A(k+1, n), ldak1,
                    A(k,   n), ldak,
                    T(k,   n), T->mb);
@@ -386,7 +394,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    ChamRight, ChamConjTrans,
                    tempnn, A->nb, tempnn, tempnn, A->nb, ib, A->nb,
                    AT(n),    A->mb,
-                   D(n),     ldan,
+                   D(n),     lddn,
                    A(k,  n), ldak,
                    T(k,  n), T->mb);
 
@@ -396,7 +404,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    &options,
                    ChamLeft, ChamNoTrans,
                    A->mb, A->nb, tempnn, A->nb, A->nb, ib, A->nb,
-                   D(k+1),  ldak1,
+                   D(k+1),  lddk1,
                    AT(n),   A->mb,
                    A(k, n), ldak,
                    T(k, n), T->mb);
@@ -408,7 +416,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
                    ChamLeft, ChamNoTrans,
                    A->mb, tempnn, tempnn, tempnn, A->nb, ib, A->nb,
                    A(k+1, n), ldak1,
-                   D(n),      ldan,
+                   D(n),      lddn,
                    A(k,   n), ldak,
                    T(k,   n), T->mb);
            }
@@ -422,10 +430,11 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
     for (k = 1; k < A->nt; k++){
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
         ldak = BLKLDD(A, k);
+        lddk = BLKLDD((&D), k);
         INSERT_TASK_zlacpy(&options,
                           uplo,
                           tempkn, tempkn, ldak,
-                          D(k), ldak,
+                          D(k), lddk,
                           A(k, k), ldak);
     }
 
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index 4ab201e7f..63c3697f1 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -99,7 +99,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
 
         for (n = Q->nt-1; n > k; n--) {
             tempnn = n == Q->nt-1 ? Q->n-n*Q->nb : Q->nb;
-            for (m = 0; m < Q->mt; m++) {
+            for (m = k; m < Q->mt; m++) {
                 tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
                 ldqm = BLKLDD(Q, m);
 
@@ -121,15 +121,16 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
         }
 
         if ( genD ) {
+            int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamUpper, tempkmin, tempkn, A->nb,
+                ChamUpper, tempkmin, tempDkn, A->nb,
                 A(k, k), ldak,
                 D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
             INSERT_TASK_zlaset(
                 &options,
-                ChamLower, tempkmin, tempkn,
+                ChamLower, tempkmin, tempDkn,
                 0., 1.,
                 D(k), lddk );
 #endif
@@ -138,6 +139,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
             tempmm = m == Q->mt-1 ? Q->m-m*Q->mb : Q->mb;
             ldqm = BLKLDD(Q, m);
 
+            /* Restore the original location of the tiles */
             RUNTIME_data_migrate( sequence, Q(m, k),
                                   Q->get_rankof( Q, m, k ) );
 
diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c
index fbc164624..e90345ec7 100644
--- a/compute/pzunglq_param.c
+++ b/compute/pzunglq_param.c
@@ -53,10 +53,6 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
 
     ib = CHAMELEON_IB;
 
-    if (D == NULL) {
-        D = A;
-    }
-
     if ( D == NULL ) {
         D    = A;
         genD = 0;
@@ -142,15 +138,16 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
             tempkmin = chameleon_min(tempkm, temppn);
 
             if ( genD ) {
+                int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkmin, temppn, A->nb,
+                    ChamUpper, tempkmin, tempDpn, A->nb,
                     A(k, p), ldak,
                     D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamLower, tempkmin, temppn,
+                    ChamLower, tempkmin, tempDpn,
                     0., 1.,
                     D(k, p), lddk );
 #endif
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index f38171b7f..6bdfcabea 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -149,15 +149,17 @@ void chameleon_pzunglqrh( int genD, int BS,
             }
 
             if ( genD ) {
+                int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
+
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkmin, tempNn, A->nb,
+                    ChamUpper, tempkmin, tempDNn, A->nb,
                     A(k, N), ldak,
                     D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamLower, tempkmin, tempNn,
+                    ChamLower, tempkmin, tempDNn,
                     0., 1.,
                     D(k, N), lddk );
 #endif
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index 4964cbb0b..dda7b25a1 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -123,15 +123,17 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
         }
 
         if ( genD ) {
+            int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+
             INSERT_TASK_zlacpy(
                 &options,
-                ChamLower, tempkm, tempkmin, A->nb,
+                ChamLower, tempDkm, tempkmin, A->nb,
                 A(k, k), ldak,
                 D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
             INSERT_TASK_zlaset(
                 &options,
-                ChamUpper, tempkm, tempkmin,
+                ChamUpper, tempDkm, tempkmin,
                 0., 1.,
                 D(k), lddk );
 #endif
diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c
index e70c16deb..346848b6f 100644
--- a/compute/pzungqr_param.c
+++ b/compute/pzungqr_param.c
@@ -143,15 +143,16 @@ void chameleon_pzungqr_param( int genD, int K,
             ldqm = BLKLDD(Q, m);
 
             if ( genD ) {
+                int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb;
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempmm, tempkmin, A->nb,
+                    ChamLower, tempDmm, tempkmin, A->nb,
                     A(m, k), ldam,
                     D(m, k), lddm );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamUpper, tempmm, tempkmin,
+                    ChamUpper, tempDmm, tempkmin,
                     0., 1.,
                     D(m, k), lddm );
 #endif
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index efec2cadd..8310ae1ae 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -155,15 +155,16 @@ void chameleon_pzungqrrh( int genD, int BS,
             }
 
             if ( genD ) {
+                int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempMm, tempkmin, A->nb,
+                    ChamLower, tempDMm, tempkmin, A->nb,
                     A(M, k), ldaM,
                     D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
                     &options,
-                    ChamUpper, tempMm, tempkmin,
+                    ChamUpper, tempDMm, tempkmin,
                     0., 1.,
                     D(M, k), lddM );
 #endif
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index 1df5d42b1..0aa1054dc 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -98,22 +98,22 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                 RUNTIME_iteration_push(chamctxt, k);
 
                 tempkm   = k == B->mt-1 ? B->m-k*B->mb : B->mb;
-                tempkn   = k == A->nt-1 ? A->n-k*A->nb : A->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
                 lddk = BLKLDD(D, k);
 
                 if ( genD ) {
+                    int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn, A->nb,
                         A(k, k), ldak,
                         D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
-                        ChamLower, tempkmin, tempkn,
+                        ChamLower, tempkmin, tempDkn,
                         0., 1.,
                         D(k), lddk );
 #endif
@@ -172,7 +172,6 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
             for (k = minMT-1; k >= 0; k--) {
                 RUNTIME_iteration_push(chamctxt, k);
 
-                tempkn   = k == A->nt-1 ? A->n-k*A->nb : A->nb;
                 tempkm   = k == B->mt-1 ? B->m-k*B->mb : B->mb;
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
@@ -203,15 +202,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                     RUNTIME_data_flush( sequence, T(k, m) );
                 }
                 if ( genD ) {
+                    int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn, A->nb,
                         A(k, k), ldak,
                         D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
-                        ChamLower, tempkmin, tempkn,
+                        ChamLower, tempkmin, tempDkn,
                         0., 1.,
                         D(k), lddk );
 #endif
@@ -273,15 +273,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                     RUNTIME_data_flush( sequence, T(k, n) );
                 }
                 if ( genD ) {
+                    int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn, A->nb,
                         A(k, k), ldak,
                         D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
-                        ChamLower, tempkmin, tempkn,
+                        ChamLower, tempkmin, tempDkn,
                         0., 1.,
                         D(k), lddk );
 #endif
@@ -321,15 +322,16 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                 lddk = BLKLDD(D, k);
 
                 if ( genD ) {
+                    int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn, A->nb,
                         A(k, k), ldak,
                         D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
-                        ChamLower, tempkmin, tempkn,
+                        ChamLower, tempkmin, tempDkn,
                         0., 1.,
                         D(k), lddk );
 #endif
diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c
index 087cc0f5c..fe122e0b4 100644
--- a/compute/pzunmlq_param.c
+++ b/compute/pzunmlq_param.c
@@ -106,15 +106,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
                     ldbp = BLKLDD(B, p);
 
                     if ( genD ) {
+                        int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, temppn, A->nb,
+                            ChamUpper, tempkmin, tempDpn, A->nb,
                             A(k, p), ldak,
                             D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, temppn,
+                            ChamLower, tempkmin, tempDpn,
                             0., 1.,
                             D(k, p), lddk );
 #endif
@@ -245,15 +246,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
                     ldbp = BLKLDD(B, p);
 
                     if ( genD ) {
+                        int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, temppn, A->nb,
+                            ChamUpper, tempkmin, tempDpn, A->nb,
                             A(k, p), ldak,
                             D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, temppn,
+                            ChamLower, tempkmin, tempDpn,
                             0., 1.,
                             D(k, p), lddk );
 #endif
@@ -341,15 +343,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
                     tempkmin = chameleon_min(tempkm, temppn);
 
                     if ( genD ) {
+                        int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, temppn, A->nb,
+                            ChamUpper, tempkmin, tempDpn, A->nb,
                             A(k, p), ldak,
                             D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, temppn,
+                            ChamLower, tempkmin, tempDpn,
                             0., 1.,
                             D(k, p), lddk );
 #endif
@@ -396,15 +399,16 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
                     tempkmin = chameleon_min(tempkm, temppn);
 
                     if ( genD ) {
+                        int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, temppn, A->nb,
+                            ChamUpper, tempkmin, tempDpn, A->nb,
                             A(k, p), ldak,
                             D(k, p), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, temppn,
+                            ChamLower, tempkmin, tempDpn,
                             0., 1.,
                             D(k, p), lddk );
 #endif
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 243469fcf..c3b7d0c1d 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -102,15 +102,17 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                     tempkmin = chameleon_min(tempkm,tempNn);
                     ldbN = BLKLDD(B, N);
                     if ( genD ) {
+                        int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempNn, A->nb,
+                            ChamUpper, tempkmin, tempDNn, A->nb,
                             A(k, N), ldak,
                             D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, tempNn,
+                            ChamLower, tempkmin, tempDNn,
                             0., 1.,
                             D(k, N), lddk );
 #endif
@@ -254,15 +256,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                         RUNTIME_data_flush( sequence, T(k, m) );
                     }
                     if ( genD ) {
+                        int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempNn, A->nb,
+                            ChamUpper, tempkmin, tempDNn, A->nb,
                             A(k, N), ldak,
                             D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, tempNn,
+                            ChamLower, tempkmin, tempDNn,
                             0., 1.,
                             D(k, N), lddk );
 #endif
@@ -339,7 +342,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                             node = B->get_rankof( B, m, n );
                             RUNTIME_data_migrate( sequence, B(m, N), node );
-                            RUNTIME_data_migrate( sequence, B(m, m), node );
+                            RUNTIME_data_migrate( sequence, B(m, n), node );
 
                             /* TS kernel */
                             INSERT_TASK_ztpmlqt(
@@ -355,15 +358,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                         RUNTIME_data_flush( sequence, T(k, n) );
                     }
                     if ( genD ) {
+                        int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempNn, A->nb,
+                            ChamUpper, tempkmin, tempDNn, A->nb,
                             A(k, N), ldak,
                             D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, tempNn,
+                            ChamLower, tempkmin, tempDNn,
                             0., 1.,
                             D(k, N), lddk );
 #endif
@@ -404,15 +408,16 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                     tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
                     tempkmin = chameleon_min(tempkm,tempNn);
                     if ( genD ) {
+                        int tempDNn = N == D->nt-1 ? D->n-N*D->nb : D->nb;
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempNn, A->nb,
+                            ChamUpper, tempkmin, tempDNn, A->nb,
                             A(k, N), ldak,
                             D(k, N), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamLower, tempkmin, tempNn,
+                            ChamLower, tempkmin, tempDNn,
                             0., 1.,
                             D(k, N), lddk );
 #endif
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index 0da2a1fd3..c872ea0de 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -103,15 +103,17 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                 lddk = BLKLDD(D, k);
                 ldbk = BLKLDD(B, k);
                 if ( genD ) {
+                    int tempDkm = k == D->mt-1 ? D->m-k*D->mb : D->mb;
+
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamLower, tempkm, tempkmin, A->nb,
+                        ChamLower, tempDkm, tempkmin, A->nb,
                         A(k, k), ldak,
                         D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
-                        ChamUpper, tempkm, tempkmin,
+                        ChamUpper, tempDkm, tempkmin,
                         0., 1.,
                         D(k), lddk );
 #endif
@@ -175,6 +177,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
+                lddk = BLKLDD(D, k);
                 for (m = B->mt-1; m > k; m--) {
                     tempmm = m == B->mt-1 ? B->m-m*B->mb : B->mb;
                     ldam = BLKLDD(A, m);
@@ -204,13 +207,13 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                         &options,
                         ChamLower, tempkm, tempkmin, A->nb,
                         A(k, k), ldak,
-                        D(k), ldak );
+                        D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
                         ChamUpper, tempkm, tempkmin,
                         0., 1.,
-                        D(k), ldak );
+                        D(k), lddk );
 #endif
                 }
                 for (n = 0; n < B->nt; n++) {
@@ -223,7 +226,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                         &options,
                         side, trans,
                         tempkm, tempnn, tempkmin, ib, T->nb,
-                        D(k), ldak,
+                        D(k),    lddk,
                         T(k, k), T->mb,
                         B(k, n), ldbk);
                 }
@@ -244,6 +247,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                 tempkn   = k == B->nt - 1 ? B->n - k * B->nb : B->nb;
                 tempkmin = k == minMT - 1 ? minM - k * A->nb : A->nb;
                 ldak = BLKLDD(A, k);
+                lddk = BLKLDD(D, k);
                 for (n = B->nt-1; n > k; n--) {
                     tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
                     ldan = BLKLDD(A, n);
@@ -273,13 +277,13 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                         &options,
                         ChamLower, tempkn, tempkmin, A->nb,
                         A(k, k), ldak,
-                        D(k), ldak );
+                        D(k),    lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
                         ChamUpper, tempkn, tempkmin,
                         0., 1.,
-                        D(k), ldak );
+                        D(k), lddk );
 #endif
                 }
                 for (m = 0; m < B->mt; m++) {
@@ -293,7 +297,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        D(k), ldak,
+                        D(k), lddk,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
@@ -314,18 +318,19 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                 tempkn   = k == B->nt-1 ? B->n-k*B->nb : B->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
+                lddk = BLKLDD(D, k);
                 if ( genD ) {
                     INSERT_TASK_zlacpy(
                         &options,
                         ChamLower, tempkn, tempkmin, A->nb,
                         A(k, k), ldak,
-                        D(k), ldak );
+                        D(k), lddk );
 #if defined(CHAMELEON_USE_CUDA)
                     INSERT_TASK_zlaset(
                         &options,
                         ChamUpper, tempkn, tempkmin,
                         0., 1.,
-                        D(k), ldak );
+                        D(k), lddk );
 #endif
                 }
                 for (m = 0; m < B->mt; m++) {
@@ -335,7 +340,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        D(k), ldak,
+                        D(k), lddk,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c
index a771077fa..4b6696da4 100644
--- a/compute/pzunmqr_param.c
+++ b/compute/pzunmqr_param.c
@@ -106,15 +106,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     ldbm = BLKLDD(B, m);
 
                     if ( genD ) {
+                        int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempmm, tempkmin, A->nb,
+                            ChamLower, tempDmm, tempkmin, A->nb,
                             A(m, k), ldam,
                             D(m, k), lddm );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempmm, tempkmin,
+                            ChamUpper, tempDmm, tempkmin,
                             0., 1.,
                             D(m, k), lddm );
 #endif
@@ -215,7 +217,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                         L = tempmm;
                         T = TT;
                     }
-                    for (n = k; n < B->nt; n++) {
+                    for (n = 0; n < B->nt; n++) {
                         tempnn = n == B->nt-1 ? B->n-n*B->nb : B->nb;
 
                         node = B->get_rankof( B, m, n );
@@ -246,15 +248,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     ldbm = BLKLDD(B, m);
 
                     if ( genD ) {
+                        int tempDmm = m == D->mt-1 ? D->m-m*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempmm, tempkmin, A->nb,
+                            ChamLower, tempDmm, tempkmin, A->nb,
                             A(m, k), ldam,
                             D(m, k), lddm );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempmm, tempkmin,
+                            ChamUpper, tempDmm, tempkmin,
                             0., 1.,
                             D(m, k), lddm );
 #endif
@@ -309,7 +313,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     }
                     else {
                         /* TT kernel */
-                        L = tempmm;
+                        L = A->mb;
                         T = TT;
                     }
 
@@ -344,15 +348,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     lddn = BLKLDD(D, n);
 
                     if ( genD ) {
+                        int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempnn, tempkmin, A->nb,
+                            ChamLower, tempDnn, tempkmin, A->nb,
                             A(n, k), ldan,
                             D(n, k), lddn );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempnn, tempkmin,
+                            ChamUpper, tempDnn, tempkmin,
                             0., 1.,
                             D(n, k), lddn );
 #endif
@@ -397,15 +403,17 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
                     lddn = BLKLDD(D, n);
 
                     if ( genD ) {
+                        int tempDnn = n == D->nt-1 ? D->n-n*D->nb : D->nb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempnn, tempkmin, A->nb,
+                            ChamLower, tempDnn, tempkmin, A->nb,
                             A(n, k), ldan,
                             D(n, k), lddn );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempnn, tempkmin,
+                            ChamUpper, tempDnn, tempkmin,
                             0., 1.,
                             D(n, k), lddn );
 #endif
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index 02a0f2b8e..7ba65c846 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -102,15 +102,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                     lddM = BLKLDD(D, M);
                     ldbM = BLKLDD(B, M);
                     if ( genD ) {
+                        int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempMm, tempkmin, A->nb,
+                            ChamLower, tempDMm, tempkmin, A->nb,
                             A(M, k), ldaM,
                             D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempMm, tempkmin,
+                            ChamUpper, tempDMm, tempkmin,
                             0., 1.,
                             D(M, k), lddM );
 #endif
@@ -254,15 +256,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                         RUNTIME_data_flush( sequence, T(m, k) );
                     }
                     if ( genD ) {
+                        int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempMm, tempkmin, A->nb,
+                            ChamLower, tempDMm, tempkmin, A->nb,
                             A(M, k), ldaM,
                             D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempMm, tempkmin,
+                            ChamUpper, tempDMm, tempkmin,
                             0., 1.,
                             D(M, k), lddM );
 #endif
@@ -353,15 +357,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                         RUNTIME_data_flush( sequence, T(n, k) );
                     }
                     if ( genD ) {
+                        int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempMm, tempkmin, A->nb,
+                            ChamLower, tempDMm, tempkmin, A->nb,
                             A(M, k), ldaM,
                             D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempMm, tempkmin,
+                            ChamUpper, tempDMm, tempkmin,
                             0., 1.,
                             D(M, k), lddM );
 #endif
@@ -401,15 +407,17 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
                     ldaM = BLKLDD(A, M);
                     lddM = BLKLDD(D, M);
                     if ( genD ) {
+                        int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
+
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempMm, tempkmin, A->nb,
+                            ChamLower, tempDMm, tempkmin, A->nb,
                             A(M, k), ldaM,
                             D(M, k), lddM );
 #if defined(CHAMELEON_USE_CUDA)
                         INSERT_TASK_zlaset(
                             &options,
-                            ChamUpper, tempMm, tempkmin,
+                            ChamUpper, tempDMm, tempkmin,
                             0., 1.,
                             D(M, k), lddM );
 #endif
diff --git a/compute/zgels.c b/compute/zgels.c
index 2aed98e7d..b132610a8 100644
--- a/compute/zgels.c
+++ b/compute/zgels.c
@@ -12,8 +12,6 @@
  * @brief Chameleon zgels wrappers
  *
  * @version 0.9.2
- * @comment This file has been automatically generated
- *          from Plasma 2.5.0 for CHAMELEON 0.9.2
  * @author Jakub Kurzak
  * @author Mathieu Faverge
  * @author Emmanuel Agullo
@@ -26,22 +24,24 @@
 #include <stdlib.h>
 
 /**
+ *******************************************************************************
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  CHAMELEON_zgels - solves overdetermined or underdetermined linear systems involving an M-by-N
- *  matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
- *  The following options are provided:
+ *  CHAMELEON_zgels - solves overdetermined or underdetermined linear systems
+ *  involving an M-by-N matrix A using the QR or the LQ factorization of A.  It
+ *  is assumed that A has full rank.  The following options are provided:
  *
- *  # trans = ChamNoTrans and M >= N: find the least squares solution of an overdetermined
- *    system, i.e., solve the least squares problem: minimize || B - A*X ||.
+ *  # trans = ChamNoTrans and M >= N: find the least squares solution of an
+ *    overdetermined system, i.e., solve the least squares problem: minimize
+ *    || B - A*X ||.
  *
- *  # trans = ChamNoTrans and M < N:  find the minimum norm solution of an underdetermined
- *    system A * X = B.
+ *  # trans = ChamNoTrans and M < N: find the minimum norm solution of an
+ *    underdetermined system A * X = B.
  *
- *  Several right hand side vectors B and solution vectors X can be handled in a single call;
- *  they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
- *  solution matrix X.
+ *  Several right hand side vectors B and solution vectors X can be handled in a
+ *  single call; they are stored as the columns of the M-by-NRHS right hand side
+ *  matrix B and the N-by-NRHS solution matrix X.
  *
  *******************************************************************************
  *
@@ -49,7 +49,6 @@
  *          Intended usage:
  *          = ChamNoTrans:   the linear system involves A;
  *          = ChamConjTrans: the linear system involves A^H.
- *          Currently only ChamNoTrans is supported.
  *
  * @param[in] M
  *          The number of rows of the matrix A. M >= 0.
@@ -58,16 +57,15 @@
  *          The number of columns of the matrix A. N >= 0.
  *
  * @param[in] NRHS
- *          The number of right hand sides, i.e., the number of columns of the matrices B and X.
- *          NRHS >= 0.
+ *          The number of right hand sides, i.e., the number of columns of the
+ *          matrices B and X.  NRHS >= 0.
  *
  * @param[in,out] A
  *          On entry, the M-by-N matrix A.
- *          On exit,
- *          if M >= N, A is overwritten by details of its QR factorization as returned by
- *                     CHAMELEON_zgeqrf;
- *          if M < N, A is overwritten by details of its LQ factorization as returned by
- *                      CHAMELEON_zgelqf.
+ *          On exit, if M >= N, A is overwritten by details of its QR
+ *          factorization as returned by CHAMELEON_zgeqrf; if M < N, A is
+ *          overwritten by details of its LQ factorization as returned by
+ *          CHAMELEON_zgelqf.
  *
  * @param[in] LDA
  *          The leading dimension of the array A. LDA >= max(1,M).
@@ -76,13 +74,14 @@
  *          On exit, auxiliary factorization data.
  *
  * @param[in,out] B
- *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
- *          On exit, if return value = 0, B is overwritten by the solution vectors, stored
- *          columnwise:
- *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
- *          sum of squares for the solution in each column is given by the sum of squares of the
- *          modulus of elements N+1 to M in that column;
- *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
+ *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored
+ *          columnwise;
+ *          On exit, if return value = 0, B is overwritten by the solution
+ *          vectors, stored columnwise: if M >= N, rows 1 to N of B contain the
+ *          least squares solution vectors; the residual sum of squares for the
+ *          solution in each column is given by the sum of squares of the
+ *          modulus of elements N+1 to M in that column; if M < N, rows 1 to N
+ *          of B contain the minimum norm solution vectors;
  *
  * @param[in] LDB
  *          The leading dimension of the array B. LDB >= MAX(1,M,N).
@@ -102,9 +101,9 @@
  *
  */
 int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
-                 CHAMELEON_Complex64_t *A, int LDA,
-                 CHAM_desc_t *descT,
-                 CHAMELEON_Complex64_t *B, int LDB )
+                     CHAMELEON_Complex64_t *A, int LDA,
+                     CHAM_desc_t *descT,
+                     CHAMELEON_Complex64_t *B, int LDB )
 {
     int i, j;
     int NB;
@@ -122,8 +121,8 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
     }
 
     /* Check input arguments */
-    if (trans != ChamNoTrans) {
-        chameleon_error("CHAMELEON_zgels", "only ChamNoTrans supported");
+    if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) {
+        chameleon_error("CHAMELEON_zgels", "illegal value of trans");
         return CHAMELEON_ERR_NOT_SUPPORTED;
     }
     if (M < 0) {
@@ -169,15 +168,14 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
     /* Submit the matrix conversion */
     if ( M >= N ) {
         chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
-                         A, NB, NB, LDA, N, M, N, sequence, &request );
+                             A, NB, NB, LDA, N, M, N, sequence, &request );
         chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
-                         B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request );
+                             B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request );
     } else {
-        /* Submit the matrix conversion */
         chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
-                         A, NB, NB, LDA, N, M, N, sequence, &request );
+                             A, NB, NB, LDA, N, M, N, sequence, &request );
         chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
-                         B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
+                             B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
     }
 
     /* Call the tile interface */
@@ -185,9 +183,9 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
 
     /* Submit the matrix conversion back */
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
-                     ChamDescInout, ChamUpperLower, sequence, &request );
+                         ChamDescInout, ChamUpperLower, sequence, &request );
     chameleon_ztile2lap( chamctxt, &descBl, &descBt,
-                     ChamDescInout, ChamUpperLower, sequence, &request );
+                         ChamDescInout, ChamUpperLower, sequence, &request );
     CHAMELEON_Desc_Flush( descT, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
@@ -206,8 +204,9 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
  *
  * @ingroup CHAMELEON_Complex64_t_Tile
  *
- *  CHAMELEON_zgels_Tile - Solves overdetermined or underdetermined linear system of equations
- *  using the tile QR or the tile LQ factorization.
+ *  CHAMELEON_zgels_Tile - solves overdetermined or underdetermined linear
+ *  systems involving an M-by-N matrix A using the QR or the LQ factorization of
+ *  A.
  *  Tile equivalent of CHAMELEON_zgels().
  *  Operates on matrices stored by tiles.
  *  All matrices are passed through descriptors.
@@ -219,27 +218,26 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
  *          Intended usage:
  *          = ChamNoTrans:   the linear system involves A;
  *          = ChamConjTrans: the linear system involves A^H.
- *          Currently only ChamNoTrans is supported.
  *
  * @param[in,out] A
  *          On entry, the M-by-N matrix A.
- *          On exit,
- *          if M >= N, A is overwritten by details of its QR factorization as returned by
- *                     CHAMELEON_zgeqrf;
- *          if M < N, A is overwritten by details of its LQ factorization as returned by
- *                      CHAMELEON_zgelqf.
+ *          On exit, if M >= N, A is overwritten by details of its QR
+ *          factorization as returned by CHAMELEON_zgeqrf; if M < N, A is
+ *          overwritten by details of its LQ factorization as returned by
+ *          CHAMELEON_zgelqf.
  *
  * @param[out] T
  *          On exit, auxiliary factorization data.
  *
  * @param[in,out] B
- *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
- *          On exit, if return value = 0, B is overwritten by the solution vectors, stored
- *          columnwise:
- *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
- *          sum of squares for the solution in each column is given by the sum of squares of the
- *          modulus of elements N+1 to M in that column;
- *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
+ *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored
+ *          columnwise;
+ *          On exit, if return value = 0, B is overwritten by the solution
+ *          vectors, stored columnwise: if M >= N, rows 1 to N of B contain the
+ *          least squares solution vectors; the residual sum of squares for the
+ *          solution in each column is given by the sum of squares of the
+ *          modulus of elements N+1 to M in that column; if M < N, rows 1 to N
+ *          of B contain the minimum norm solution vectors;
  *
  *******************************************************************************
  *
@@ -255,7 +253,7 @@ int CHAMELEON_zgels( cham_trans_t trans, int M, int N, int NRHS,
  *
  */
 int CHAMELEON_zgels_Tile( cham_trans_t trans, CHAM_desc_t *A,
-                      CHAM_desc_t *T, CHAM_desc_t *B )
+                          CHAM_desc_t *T, CHAM_desc_t *B )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
@@ -311,8 +309,8 @@ int CHAMELEON_zgels_Tile( cham_trans_t trans, CHAM_desc_t *A,
  *
  */
 int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
-                            CHAM_desc_t *T, CHAM_desc_t *B,
-                            RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+                                CHAM_desc_t *T, CHAM_desc_t *B,
+                                RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_desc_t *subA;
     CHAM_desc_t *subB;
@@ -341,35 +339,28 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
     }
 
     /* Check descriptors for correctness */
+    if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) {
+        chameleon_error("CHAMELEON_zgels_Tile_Async", "illegal value of trans");
+        return CHAMELEON_ERR_NOT_SUPPORTED;
+    }
     if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
-        chameleon_error("CHAMELEON_zgels_Tile", "invalid first descriptor");
+        chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid first descriptor");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
     if (chameleon_desc_check(T) != CHAMELEON_SUCCESS) {
-        chameleon_error("CHAMELEON_zgels_Tile", "invalid second descriptor");
+        chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid second descriptor");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
     if (chameleon_desc_check(B) != CHAMELEON_SUCCESS) {
-        chameleon_error("CHAMELEON_zgels_Tile", "invalid third descriptor");
+        chameleon_error("CHAMELEON_zgels_Tile_Async", "invalid third descriptor");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
     /* Check input arguments */
     if (A->nb != A->mb || B->nb != B->mb) {
-        chameleon_error("CHAMELEON_zgels_Tile", "only square tiles supported");
+        chameleon_error("CHAMELEON_zgels_Tile_Async", "only square tiles supported");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
-    if (trans != ChamNoTrans) {
-        chameleon_error("CHAMELEON_zgels_Tile", "only ChamNoTrans supported");
-        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_NOT_SUPPORTED);
-    }
-    /* Quick return  - currently NOT equivalent to LAPACK's:
-     if (chameleon_min(M, chameleon_min(N, NRHS)) == 0) {
-     for (i = 0; i < chameleon_max(M, N); i++)
-     for (j = 0; j < NRHS; j++)
-     B[j*LDB+i] = 0.0;
-     return CHAMELEON_SUCCESS;
-     }
-     */
+
     if (A->m >= A->n) {
 #if defined(CHAMELEON_COPY_DIAG)
         {
@@ -378,24 +369,67 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
             Dptr = &D;
         }
 #endif
+        /*
+         * compute QR factorization of A
+         */
         if (chamctxt->householder == ChamFlatHouseholder) {
             chameleon_pzgeqrf( 1, A, T, Dptr, sequence, request );
-
-            chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
         }
         else {
             chameleon_pzgeqrfrh( 1, CHAMELEON_RHBLK, A, T, Dptr, sequence, request );
+        }
+
+        /* Perform the solve part */
+        if ( trans == ChamNoTrans ) {
+            /*
+             * Least-Squares Problem min || A * X - B ||
+             *
+             * B(1:M,1:NRHS) := Q**H * B(1:M,1:NRHS)
+             */
+            if (chamctxt->householder == ChamFlatHouseholder) {
+                chameleon_pzunmqr( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            }
+            else {
+                chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            }
 
-            chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            /*
+             * B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
+            chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+        }
+        else {
+            /*
+             * Underdetermined system of equations A**H * X = B
+             *
+             * B(1:N,1:NRHS) := inv(R**H) * B(1:N,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
+            chameleon_pztrsm( ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+
+            /*
+             * B(M+1:N,1:NRHS) = 0
+             */
+            /* TODO: enable subdescriptor with non aligned starting point in laset */
+            /* free(subB); */
+            /* subB = chameleon_desc_submatrix(B, A->n, 0, A->m-A->n, B->n); */
+            /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */
+
+            /*
+             * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS)
+             */
+            if (chamctxt->householder == ChamFlatHouseholder) {
+                chameleon_pzunmqr( 0, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request );
+            }
+            else {
+                chameleon_pzunmqrrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request );
+            }
         }
-        subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
-        subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
-        chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
     }
     else {
-        /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n);
-         chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request );
-         free(subB); */
 #if defined(CHAMELEON_COPY_DIAG)
         {
             int m = chameleon_min(A->m, A->n);
@@ -403,21 +437,64 @@ int CHAMELEON_zgels_Tile_Async( cham_trans_t trans, CHAM_desc_t *A,
             Dptr = &D;
         }
 #endif
+        /*
+         * Compute LQ factorization of A
+         */
         if (chamctxt->householder == ChamFlatHouseholder) {
             chameleon_pzgelqf( 1, A, T, Dptr, sequence, request );
         }
         else {
             chameleon_pzgelqfrh( 1, CHAMELEON_RHBLK, A, T, Dptr, sequence, request );
         }
-        subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
-        subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
-        chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
 
-        if (chamctxt->householder == ChamFlatHouseholder) {
-            chameleon_pzunmlq( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+        /* Perform the solve part */
+        if ( trans == ChamNoTrans ) {
+            /*
+             * Underdetermined system of equations A * X = B
+             *
+             * B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
+            chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+
+            /*
+             * B(M+1:N,1:NRHS) = 0
+             */
+            /* TODO: enable subdescriptor with non aligned starting point in laset */
+            /* free(subB); */
+            /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n); */
+            /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */
+
+            /*
+             * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS)
+             */
+            if (chamctxt->householder == ChamFlatHouseholder) {
+                chameleon_pzunmlq( 0, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            }
+            else {
+                chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            }
         }
         else {
-            chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamConjTrans, A, B, T, Dptr, sequence, request );
+            /*
+             * Overdetermined system min || A**H * X - B ||
+             *
+             * B(1:N,1:NRHS) := Q * B(1:N,1:NRHS)
+             */
+            if (chamctxt->householder == ChamFlatHouseholder) {
+                chameleon_pzunmlq( 0, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request );
+            }
+            else {
+                chameleon_pzunmlqrh( 0, CHAMELEON_RHBLK, ChamLeft, ChamNoTrans, A, B, T, Dptr, sequence, request );
+            }
+
+            /*
+             * B(1:M,1:NRHS) := inv(L**H) * B(1:M,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
+            chameleon_pztrsm( ChamLeft, ChamLower, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
         }
     }
 
diff --git a/compute/zgels_param.c b/compute/zgels_param.c
index 697e1b4ff..01999735f 100644
--- a/compute/zgels_param.c
+++ b/compute/zgels_param.c
@@ -26,19 +26,20 @@
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  CHAMELEON_zgels_param - solves overdetermined or underdetermined linear systems involving an M-by-N
- *  matrix A using the QR or the LQ factorization of A.  It is assumed that A has full rank.
- *  The following options are provided:
+ *  CHAMELEON_zgels_param - solves overdetermined or underdetermined linear
+ *  systems involving an M-by-N matrix A using the QR or the LQ factorization of
+ *  A.  It is assumed that A has full rank.  The following options are provided:
  *
- *  # trans = ChamNoTrans and M >= N: find the least squares solution of an overdetermined
- *    system, i.e., solve the least squares problem: minimize || B - A*X ||.
+ *  # trans = ChamNoTrans and M >= N: find the least squares solution of an
+ *    overdetermined system, i.e., solve the least squares problem: minimize
+ *    || B - A*X ||.
  *
- *  # trans = ChamNoTrans and M < N:  find the minimum norm solution of an underdetermined
- *    system A * X = B.
+ *  # trans = ChamNoTrans and M < N: find the minimum norm solution of an
+ *    underdetermined system A * X = B.
  *
- *  Several right hand side vectors B and solution vectors X can be handled in a single call;
- *  they are stored as the columns of the M-by-NRHS right hand side matrix B and the N-by-NRHS
- *  solution matrix X.
+ *  Several right hand side vectors B and solution vectors X can be handled in a
+ *  single call; they are stored as the columns of the M-by-NRHS right hand side
+ *  matrix B and the N-by-NRHS solution matrix X.
  *
  *******************************************************************************
  *
@@ -49,7 +50,6 @@
  *          Intended usage:
  *          = ChamNoTrans:   the linear system involves A;
  *          = ChamConjTrans: the linear system involves A^H.
- *          Currently only ChamNoTrans is supported.
  *
  * @param[in] M
  *          The number of rows of the matrix A. M >= 0.
@@ -58,16 +58,15 @@
  *          The number of columns of the matrix A. N >= 0.
  *
  * @param[in] NRHS
- *          The number of right hand sides, i.e., the number of columns of the matrices B and X.
- *          NRHS >= 0.
+ *          The number of right hand sides, i.e., the number of columns of the
+ *          matrices B and X.  NRHS >= 0.
  *
  * @param[in,out] A
  *          On entry, the M-by-N matrix A.
- *          On exit,
- *          if M >= N, A is overwritten by details of its QR factorization as returned by
- *                     CHAMELEON_zgeqrf;
- *          if M < N, A is overwritten by details of its LQ factorization as returned by
- *                      CHAMELEON_zgelqf.
+ *          On exit, if M >= N, A is overwritten by details of its QR
+ *          factorization as returned by CHAMELEON_zgeqrf; if M < N, A is
+ *          overwritten by details of its LQ factorization as returned by
+ *          CHAMELEON_zgelqf.
  *
  * @param[in] LDA
  *          The leading dimension of the array A. LDA >= max(1,M).
@@ -79,13 +78,14 @@
  *          On exit, auxiliary factorization data.
  *
  * @param[in,out] B
- *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
- *          On exit, if return value = 0, B is overwritten by the solution vectors, stored
- *          columnwise:
- *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
- *          sum of squares for the solution in each column is given by the sum of squares of the
- *          modulus of elements N+1 to M in that column;
- *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
+ *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored
+ *          columnwise;
+ *          On exit, if return value = 0, B is overwritten by the solution
+ *          vectors, stored columnwise: if M >= N, rows 1 to N of B contain the
+ *          least squares solution vectors; the residual sum of squares for the
+ *          solution in each column is given by the sum of squares of the
+ *          modulus of elements N+1 to M in that column; if M < N, rows 1 to N
+ *          of B contain the minimum norm solution vectors;
  *
  * @param[in] LDB
  *          The leading dimension of the array B. LDB >= MAX(1,M,N).
@@ -105,9 +105,9 @@
  *
  */
 int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int M, int N, int NRHS,
-                       CHAMELEON_Complex64_t *A, int LDA,
-                       CHAM_desc_t *descTS, CHAM_desc_t *descTT,
-                       CHAMELEON_Complex64_t *B, int LDB )
+                           CHAMELEON_Complex64_t *A, int LDA,
+                           CHAM_desc_t *descTS, CHAM_desc_t *descTT,
+                           CHAMELEON_Complex64_t *B, int LDB )
 {
     int i, j;
     int NB;
@@ -125,8 +125,8 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
     }
 
     /* Check input arguments */
-    if (trans != ChamNoTrans) {
-        chameleon_error("CHAMELEON_zgels_param", "only ChamNoTrans supported");
+    if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) {
+        chameleon_error("CHAMELEON_zgels_param", "illegal value of trans");
         return CHAMELEON_ERR_NOT_SUPPORTED;
     }
     if (M < 0) {
@@ -172,14 +172,14 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
     /* Submit the matrix conversion */
     if ( M >= N ) {
         chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
-                         A, NB, NB, LDA, N, M, N, sequence, &request );
+                             A, NB, NB, LDA, N, M, N, sequence, &request );
         chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
-                         B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request );
+                             B, NB, NB, LDB, NRHS, M, NRHS, sequence, &request );
     } else {
         chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, ChamUpperLower,
-                         A, NB, NB, LDA, N, M, N, sequence, &request );
+                             A, NB, NB, LDA, N, M, N, sequence, &request );
         chameleon_zlap2tile( chamctxt, &descBl, &descBt, ChamDescInout, ChamUpperLower,
-                         B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
+                             B, NB, NB, LDB, NRHS, N, NRHS, sequence, &request );
     }
 
     /* Call the tile interface */
@@ -187,9 +187,9 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
 
     /* Submit the matrix conversion back */
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
-                     ChamDescInout, ChamUpperLower, sequence, &request );
+                         ChamDescInout, ChamUpperLower, sequence, &request );
     chameleon_ztile2lap( chamctxt, &descBl, &descBt,
-                     ChamDescInout, ChamUpperLower, sequence, &request );
+                         ChamDescInout, ChamUpperLower, sequence, &request );
     CHAMELEON_Desc_Flush( descTS, sequence );
     CHAMELEON_Desc_Flush( descTT, sequence );
 
@@ -205,12 +205,13 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
 }
 
 /**
- *******************************************************************************
+ ********************************************************************************
  *
  * @ingroup CHAMELEON_Complex64_t_Tile
  *
- *  CHAMELEON_zgels_param_Tile - Solves overdetermined or underdetermined linear system of equations
- *  using the tile QR or the tile LQ factorization.
+ *  CHAMELEON_zgels_param_Tile - solves overdetermined or underdetermined linear
+ *  systems involving an M-by-N matrix A using the QR or the LQ factorization of
+ *  A.
  *  Tile equivalent of CHAMELEON_zgels_param().
  *  Operates on matrices stored by tiles.
  *  All matrices are passed through descriptors.
@@ -222,15 +223,13 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
  *          Intended usage:
  *          = ChamNoTrans:   the linear system involves A;
  *          = ChamConjTrans: the linear system involves A^H.
- *          Currently only ChamNoTrans is supported.
  *
  * @param[in,out] A
  *          On entry, the M-by-N matrix A.
- *          On exit,
- *          if M >= N, A is overwritten by details of its QR factorization as returned by
- *                     CHAMELEON_zgeqrf;
- *          if M < N, A is overwritten by details of its LQ factorization as returned by
- *                      CHAMELEON_zgelqf.
+ *          On exit, if M >= N, A is overwritten by details of its QR
+ *          factorization as returned by CHAMELEON_zgeqrf; if M < N, A is
+ *          overwritten by details of its LQ factorization as returned by
+ *          CHAMELEON_zgelqf.
  *
  * @param[out] TS
  *          On exit, auxiliary factorization data.
@@ -239,13 +238,14 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
  *          On exit, auxiliary factorization data.
  *
  * @param[in,out] B
- *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored columnwise;
- *          On exit, if return value = 0, B is overwritten by the solution vectors, stored
- *          columnwise:
- *          if M >= N, rows 1 to N of B contain the least squares solution vectors; the residual
- *          sum of squares for the solution in each column is given by the sum of squares of the
- *          modulus of elements N+1 to M in that column;
- *          if M < N, rows 1 to N of B contain the minimum norm solution vectors;
+ *          On entry, the M-by-NRHS matrix B of right hand side vectors, stored
+ *          columnwise;
+ *          On exit, if return value = 0, B is overwritten by the solution
+ *          vectors, stored columnwise: if M >= N, rows 1 to N of B contain the
+ *          least squares solution vectors; the residual sum of squares for the
+ *          solution in each column is given by the sum of squares of the
+ *          modulus of elements N+1 to M in that column; if M < N, rows 1 to N
+ *          of B contain the minimum norm solution vectors;
  *
  *******************************************************************************
  *
@@ -261,7 +261,7 @@ int CHAMELEON_zgels_param( const libhqr_tree_t *qrtree, cham_trans_t trans, int
  *
  */
 int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans, CHAM_desc_t *A,
-                            CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B )
+                                CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
@@ -289,7 +289,7 @@ int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans,
 }
 
 /**
- *******************************************************************************
+ ********************************************************************************
  *
  * @ingroup CHAMELEON_Complex64_t_Tile_Async
  *
@@ -318,8 +318,8 @@ int CHAMELEON_zgels_param_Tile( const libhqr_tree_t *qrtree, cham_trans_t trans,
  *
  */
 int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t trans, CHAM_desc_t *A,
-                                  CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B,
-                                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+                                      CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *B,
+                                      RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_desc_t *subA;
     CHAM_desc_t *subB;
@@ -348,6 +348,10 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
     }
 
     /* Check descriptors for correctness */
+    if ( (trans != ChamNoTrans) && (trans != ChamConjTrans) ) {
+        chameleon_error("CHAMELEON_zgels_param_Tile_Async", "illegal value of trans");
+        return CHAMELEON_ERR_NOT_SUPPORTED;
+    }
     if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
         chameleon_error("CHAMELEON_zgels_param_Tile", "invalid first descriptor");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
@@ -369,20 +373,8 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
         chameleon_error("CHAMELEON_zgels_param_Tile", "only square tiles supported");
         return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
     }
-    if (trans != ChamNoTrans) {
-        chameleon_error("CHAMELEON_zgels_param_Tile", "only ChamNoTrans supported");
-        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_NOT_SUPPORTED);
-    }
-    /* Quick return  - currently NOT equivalent to LAPACK's:
-     if (chameleon_min(M, chameleon_min(N, NRHS)) == 0) {
-     for (i = 0; i < chameleon_max(M, N); i++)
-     for (j = 0; j < NRHS; j++)
-     B[j*LDB+i] = 0.0;
-     return CHAMELEON_SUCCESS;
-     }
-     */
-    if (A->m >= A->n) {
 
+    if (A->m >= A->n) {
 #if defined(CHAMELEON_COPY_DIAG)
         {
             int n = chameleon_min(A->m, A->n);
@@ -390,14 +382,51 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
             Dptr = &D;
         }
 #endif
-
-        subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
-        subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
-
+        /*
+         * compute QR factorization of A
+         */
         chameleon_pzgeqrf_param( 1, A->nt, qrtree, A,
                                  TS, TT, Dptr, sequence, request );
-        chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request );
-        chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+
+        /* Perform the solve part */
+        if ( trans == ChamNoTrans ) {
+            /*
+             * Least-Squares Problem min || A * X - B ||
+             *
+             * B(1:M,1:NRHS) := Q**H * B(1:M,1:NRHS)
+             */
+            chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request );
+
+            /*
+             * B(1:N,1:NRHS) := inv(R) * B(1:N,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
+            chameleon_pztrsm( ChamLeft, ChamUpper, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+        }
+        else {
+            /*
+             * Underdetermined system of equations A**H * X = B
+             *
+             * B(1:N,1:NRHS) := inv(R**H) * B(1:N,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->n, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->n, A->n);
+            chameleon_pztrsm( ChamLeft, ChamUpper, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+
+            /*
+             * B(M+1:N,1:NRHS) = 0
+             */
+            /* TODO: enable subdescriptor with non aligned starting point in laset */
+            /* free(subB); */
+            /* subB = chameleon_desc_submatrix(B, A->n, 0, A->m-A->n, B->n); */
+            /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */
+
+            /*
+             * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS)
+             */
+            chameleon_pzunmqr_param( 0, qrtree, ChamLeft, ChamNoTrans, A, B, TS, TT, Dptr, sequence, request );
+        }
     }
     else {
 #if defined(CHAMELEON_COPY_DIAG)
@@ -407,13 +436,50 @@ int CHAMELEON_zgels_param_Tile_Async( const libhqr_tree_t *qrtree, cham_trans_t
             Dptr = &D;
         }
 #endif
+        /*
+         * Compute LQ factorization of A
+         */
+        chameleon_pzgelqf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request );
 
-        subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
-        subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
+        /* Perform the solve part */
+        if ( trans == ChamNoTrans ) {
+            /*
+             * Underdetermined system of equations A * X = B
+             *
+             * B(1:M,1:NRHS) := inv(L) * B(1:M,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
+            chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
 
-        chameleon_pzgelqf_param( 1, qrtree, A, TS, TT, Dptr, sequence, request );
-        chameleon_pztrsm( ChamLeft, ChamLower, ChamNoTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
-        chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request );
+            /*
+             * B(M+1:N,1:NRHS) = 0
+             */
+            /* TODO: enable subdescriptor with non aligned starting point in laset */
+            /* free(subB); */
+            /* subB = chameleon_desc_submatrix(B, A->m, 0, A->n-A->m, B->n); */
+            /* chameleon_pzlaset( ChamUpperLower, 0., 0., subB, sequence, request ); */
+
+            /*
+             * B(1:N,1:NRHS) := Q(1:N,:)**H * B(1:M,1:NRHS)
+             */
+            chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamConjTrans, A, B, TS, TT, Dptr, sequence, request );
+        }
+        else {
+            /*
+             * Overdetermined system min || A**H * X - B ||
+             *
+             * B(1:N,1:NRHS) := Q * B(1:N,1:NRHS)
+             */
+            chameleon_pzunmlq_param( 0, qrtree, ChamLeft, ChamNoTrans, A, B, TS, TT, Dptr, sequence, request );
+
+            /*
+             * B(1:M,1:NRHS) := inv(L**H) * B(1:M,1:NRHS)
+             */
+            subB = chameleon_desc_submatrix(B, 0, 0, A->m, B->n);
+            subA = chameleon_desc_submatrix(A, 0, 0, A->m, A->m);
+            chameleon_pztrsm( ChamLeft, ChamLower, ChamConjTrans, ChamNonUnit, 1.0, subA, subB, sequence, request );
+        }
     }
 
     free(subA);
diff --git a/compute/zunmlq.c b/compute/zunmlq.c
index bbc4a6a77..07c635a67 100644
--- a/compute/zunmlq.c
+++ b/compute/zunmlq.c
@@ -219,7 +219,6 @@ int CHAMELEON_zunmlq( cham_side_t side, cham_trans_t trans, int M, int N, int K,
  *          Intended usage:
  *          = ChamNoTrans:   no transpose, apply Q;
  *          = ChamConjTrans: conjugate transpose, apply Q^H.
- *          Currently only ChamConjTrans is supported.
  *
  * @param[in] A
  *          Details of the LQ factorization of the original matrix A as returned by CHAMELEON_zgelqf.
diff --git a/compute/zunmqr.c b/compute/zunmqr.c
index 4139f579a..09387b79b 100644
--- a/compute/zunmqr.c
+++ b/compute/zunmqr.c
@@ -220,7 +220,6 @@ int CHAMELEON_zunmqr( cham_side_t side, cham_trans_t trans, int M, int N, int K,
  *          Intended usage:
  *          = ChamNoTrans:   no transpose, apply Q;
  *          = ChamConjTrans: conjugate transpose, apply Q^H.
- *          Currently only ChamConjTrans is supported.
  *
  * @param[in] A
  *          Details of the QR factorization of the original matrix A as returned by CHAMELEON_zgeqrf.
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index df527bbf5..9c837e5b4 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -415,6 +415,7 @@ void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence,
                            const CHAM_desc_t *A, int Am, int An, int new_rank )
 {
 #if defined(HAVE_STARPU_MPI_DATA_MIGRATE)
+    int old_rank;
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(A->schedopt);
     starpu_data_handle_t lhandle;
     handle += ((int64_t)(A->lmt) * (int64_t)An + (int64_t)Am);
@@ -424,8 +425,11 @@ void RUNTIME_data_migrate( const RUNTIME_sequence_t *sequence,
         /* Register the data */
         lhandle = RUNTIME_data_getaddr( A, Am, An );
     }
+    old_rank = starpu_mpi_data_get_rank( lhandle );
 
-    starpu_mpi_data_migrate( MPI_COMM_WORLD, lhandle, new_rank );
+    if ( old_rank != new_rank ) {
+        starpu_mpi_data_migrate( MPI_COMM_WORLD, lhandle, new_rank );
+    }
 
     (void)sequence;
 #else
-- 
GitLab