diff --git a/compute/pzgebrd_ge2gb.c b/compute/pzgebrd_ge2gb.c
index 710eb1604a0eaf34b90e858d7ef5a6f85efd9904..b9d0ec1e631d76f7f383786f194e83fb1f73cf4a 100644
--- a/compute/pzgebrd_ge2gb.c
+++ b/compute/pzgebrd_ge2gb.c
@@ -25,7 +25,7 @@
  **/
 #include "control/common.h"
 
-void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
+void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T, MORSE_desc_t D,
                          MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     int k;
@@ -38,6 +38,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
            morse_pzgeqrf(
                morse_desc_submatrix(&A, k*A.mb, k*A.nb, A.m-k*A.mb, tempkn),
                morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
+               morse_desc_submatrix(&D, k*T.mb, k*T.nb, T.m-k*T.mb, tempkn),
                sequence, request);
 
            morse_pzunmqr(
@@ -46,6 +47,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
                morse_desc_submatrix(&A, k*A.mb,     k*A.nb, A.m-k*A.mb, tempkn),
                morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, A.m-k*A.mb, A.n-(k+1)*A.nb),
                morse_desc_submatrix(&T, k*T.mb,     k*T.nb, T.m-k*T.mb, tempkn),
+               morse_desc_submatrix(&D, k*T.mb,     k*T.nb, T.m-k*T.mb, tempkn),
                sequence, request);
 
            if (k+1 < A.nt){
@@ -54,6 +56,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
               morse_pzgelqf(
                   morse_desc_submatrix(&A, k*A.mb, (k+1)*A.nb, tempkm, A.n-(k+1)*A.nb),
                   morse_desc_submatrix(&T, k*T.mb, (k+1)*T.nb, T.mb,   T.n-(k+1)*T.nb),
+                  morse_desc_submatrix(&D, k*T.mb, (k+1)*T.nb, T.mb,   T.n-(k+1)*T.nb),
                   sequence, request);
 
               morse_pzunmlq(
@@ -61,6 +64,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
                   morse_desc_submatrix(&A,     k*A.mb, (k+1)*A.nb, tempkm,         A.n-(k+1)*A.nb),
                   morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
                   morse_desc_submatrix(&T,     k*T.mb, (k+1)*T.nb, T.mb,           T.n-(k+1)*T.nb),
+                  morse_desc_submatrix(&D,     k*T.mb, (k+1)*T.nb, T.mb,           T.n-(k+1)*T.nb),
                   sequence, request);
            }
        }
@@ -73,6 +77,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
            morse_pzgelqf(
                morse_desc_submatrix(&A, k*A.mb, k*A.nb, tempkm, A.n-k*A.nb),
                morse_desc_submatrix(&T, k*T.mb, k*T.nb, T.mb,   T.n-k*T.nb),
+               morse_desc_submatrix(&D, k*T.mb, k*T.nb, T.mb,   T.n-k*T.nb),
                sequence, request);
 
            morse_pzunmlq(
@@ -80,6 +85,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
                morse_desc_submatrix(&A,     k*A.mb, k*A.nb, tempkm,         A.n-k*A.nb),
                morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, A.n-k*A.nb),
                morse_desc_submatrix(&T,     k*T.mb, k*T.nb, T.mb,           T.n-k*T.nb),
+               morse_desc_submatrix(&D,     k*T.mb, k*T.nb, T.mb,           T.n-k*T.nb),
                sequence, request);
 
            if (k+1 < A.mt){
@@ -88,6 +94,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
               morse_pzgeqrf(
                    morse_desc_submatrix(&A, (k+1)*A.mb, k*A.nb, A.m-(k+1)*A.mb, tempkn),
                    morse_desc_submatrix(&T, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
+                   morse_desc_submatrix(&D, (k+1)*T.mb, k*T.nb, T.m-(k+1)*T.mb, tempkn),
                    sequence, request);
 
               morse_pzunmqr(
@@ -95,6 +102,7 @@ void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T,
                   morse_desc_submatrix(&A, (k+1)*A.mb,     k*A.nb, A.m-(k+1)*A.mb, tempkn),
                   morse_desc_submatrix(&A, (k+1)*A.mb, (k+1)*A.nb, A.m-(k+1)*A.mb, A.n-(k+1)*A.nb),
                   morse_desc_submatrix(&T, (k+1)*T.mb,     k*T.nb, T.m-(k+1)*T.mb, tempkn),
+                  morse_desc_submatrix(&D, (k+1)*T.mb,     k*T.nb, T.m-(k+1)*T.mb, tempkn),
                   sequence, request);
            }
        }
diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 36cf19e1b672bb219fdbd2eb6f28f0e221ebeb28..46023eb1ae2049fc09a1aa0ff1d201a89b33eb0c 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -33,22 +33,21 @@
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k)   D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k)   A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel tile LQ factorization - dynamic scheduling
  **/
-void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
+void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldam;
@@ -91,12 +90,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tslqt and unmlq tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, chameleon_min(A->m, A->n), A->nb, 0, 0, chameleon_min(A->m, A->n), A->nb, A->p, A->q);
-#endif
-
     for (k = 0; k < minMNT; k++) {
         RUNTIME_iteration_push(morse, k);
 
@@ -114,13 +107,13 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
                 &options,
                 MorseUpper, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
-                DIAG(k), ldak );
+                D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseLower, A->mb, A->nb,
                 0., 1.,
-                DIAG(k), ldak );
+                D(k), ldak );
 #endif
 #endif
         }
@@ -131,7 +124,7 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
                 &options,
                 MorseRight, MorseConjTrans,
                 tempmm, tempkn, tempkn, ib, T->nb,
-                DIAG(k), ldak,
+                D(k), ldak,
                 T(k, k), T->mb,
                 A(m, k), ldam);
         }
@@ -162,11 +155,4 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index 0fee71961a000b184ea3723471914f464e085546..0e8afc068eafdd010cb3843a8bb7a1effdc7a88b 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -34,22 +34,21 @@
 #define T(m,n)  T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((n)/BS), 0
+#define D(m,n) D, ((n)/BS), 0
 #else
-#define DIAG(m,n) A,  (m),  (n)
+#define D(m,n) A,  (m),  (n)
 #endif
 
 /*
  *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
  */
-void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
+void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, N, RD;
@@ -89,15 +88,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    {
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
 
     /* The number of the factorization */
@@ -120,13 +110,13 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                     &options,
                     MorseUpper, tempkm, tempNn, A->nb,
                     A(k, N), ldak,
-                    DIAG(k, N), ldak );
+                    D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkm, tempNn,
                     0., 1.,
-                    DIAG(k, N), ldak );
+                    D(k, N), ldak );
 #endif
 #endif
             }
@@ -137,7 +127,7 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                     &options,
                     MorseRight, MorseConjTrans,
                     tempmm, tempNn, tempkmin, ib, T->nb,
-                    DIAG(k, N), ldak,
+                    D(k, N), ldak,
                     T(k, N), T->mb,
                     A(m, N), ldam);
             }
@@ -193,11 +183,4 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 6e061f0ed2fdf05f64649c31c677f55928f7dd51..f9574d9f3fc87c17308fb6b83d1d75d142bd3b39 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -33,22 +33,21 @@
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k)   D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k)   A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization - dynamic scheduling
  **/
-void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
+void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldam;
@@ -86,12 +85,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tsqrt and unmqr tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, chameleon_min(A->m, A->n), A->nb, 0, 0, chameleon_min(A->m, A->n), A->nb, A->p, A->q);
-#endif
-
     for (k = 0; k < minMNT; k++) {
         RUNTIME_iteration_push(morse, k);
 
@@ -109,13 +102,13 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
                 &options,
                 MorseLower, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
-                DIAG(k), ldak );
+                D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseUpper, A->mb, A->nb,
                 0., 1.,
-                DIAG(k), ldak );
+                D(k), ldak );
 #endif
 #endif
         }
@@ -125,7 +118,7 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
                 &options,
                 MorseLeft, MorseConjTrans,
                 tempkm, tempnn, tempkm, ib, T->nb,
-                DIAG(k), ldak,
+                D(k), ldak,
                 T(k, k), T->mb,
                 A(k, n), ldak);
         }
@@ -156,11 +149,4 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index c5b026358376ce0d2e06481bd166e760983aa90c..5355b97b7357926872ecb36e9cc30bafd6f98342 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -35,22 +35,21 @@
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m), ((n)+A->nt)
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((m)/BS), 0
+#define D(m,n) D, ((m)/BS), 0
 #else
-#define DIAG(m,n) A,  (m),  (n)
+#define D(m,n) A,  (m),  (n)
 #endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
  **/
-void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
+void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, M, RD;
@@ -90,15 +89,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    {
-        /* necessary to avoid dependencies between tasks regarding the diag tile */
-        int nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
     for (k = 0; k < K; k++) {
         RUNTIME_iteration_push(morse, k);
@@ -119,13 +109,13 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                 &options,
                 MorseLower, tempMm, A->nb, A->nb,
                 A(M, k), ldaM,
-                DIAG(M, k), ldaM );
+                D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempMm, A->nb,
                     0., 1.,
-                    DIAG(M, k), ldaM );
+                    D(M, k), ldaM );
 #endif
 #endif
             }
@@ -135,7 +125,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                     &options,
                     MorseLeft, MorseConjTrans,
                     tempMm, tempnn, tempkmin, ib, T->nb,
-                    DIAG(M, k), ldaM,
+                    D(M, k), ldaM,
                     T(M, k), T->mb,
                     A(M, n), ldaM);
             }
@@ -193,11 +183,4 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c
index 2173faa09def0a8a31440baf84f563dbce432f6f..2230bd0c20244c7d7eb397670cfab357d464fab4 100644
--- a/compute/pzhetrd_he2hb.c
+++ b/compute/pzhetrd_he2hb.c
@@ -41,12 +41,11 @@
  *  Parallel tile BAND Tridiagonal Reduction - dynamic scheduler
  **/
 void morse_pzhetrd_he2hb(MORSE_enum uplo,
-                         MORSE_desc_t *A, MORSE_desc_t *T,
+                         MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *E,
                          MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
-    MORSE_desc_t *E  = NULL;
     MORSE_desc_t *D  = NULL;
     MORSE_desc_t *AT = NULL;
     size_t ws_worker = 0;
@@ -90,12 +89,6 @@ void morse_pzhetrd_he2hb(MORSE_enum uplo,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* Copy of the extra-diagonal to generate more parallelism by releasing anti-dependencies on UNMQR/TSMQR triangle conflict */
-    E = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*E, A->mb, A->nb, chameleon_min(A->m, A->n), A->nb, 0, 0, chameleon_min(A->m, A->n), A->nb, A->p, A->q);
-#endif
-
     /* Copy of the diagonal tiles to keep the general version of the tile all along the computation */
     D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
     morse_zdesc_alloc_diag(*D, A->mb, A->nb, chameleon_min(A->m, A->n) - A->mb, A->nb, 0, 0, chameleon_min(A->m, A->n) - A->mb, A->nb, A->p, A->q);
@@ -451,10 +444,4 @@ void morse_pzhetrd_he2hb(MORSE_enum uplo,
 
     morse_desc_mat_free(AT);
     free(AT);
-
-#if defined(CHAMELEON_COPY_DIAG)
-    morse_desc_mat_free(E);
-    free(E);
-#endif
-    (void)E;
 }
diff --git a/compute/pztpgqrt.c b/compute/pztpgqrt.c
index 4f8b5e8b0d09ec1489912fd59518b51782511539..27f2c17018934fee64173c65c5a2dd357d2151f8 100644
--- a/compute/pztpgqrt.c
+++ b/compute/pztpgqrt.c
@@ -31,9 +31,9 @@
 #define Q1(m,n) Q1,  m,  n
 #define Q2(m,n) Q2,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k)    D, k, 0
 #else
-#define DIAG(k) V1, k, k
+#define D(k)    V1, k, k
 #endif
 
 /***************************************************************************//**
@@ -43,19 +43,19 @@ void morse_pztpgqrt( int L,
                      MORSE_desc_t *V1, MORSE_desc_t *T1,
                      MORSE_desc_t *V2, MORSE_desc_t *T2,
                      MORSE_desc_t *Q1, MORSE_desc_t *Q2,
+                     MORSE_desc_t *D,
                      MORSE_sequence_t *sequence, MORSE_request_t *request )
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldvk, ldvm;
     int ldqk, ldqm;
     int tempkm, tempkn, tempkk, tempnn, tempmm, templm;
-    int ib, minMT;
+    int ib;
 
     /* Dimension of the first column */
     int maxm  = chameleon_max( Q2->m - L, 1 );
@@ -68,13 +68,6 @@ void morse_pztpgqrt( int L,
     RUNTIME_options_init(&options, morse, sequence, request);
 
     ib = MORSE_IB;
-
-    if (V1->m > V1->n) {
-        minMT = V1->nt;
-    } else {
-        minMT = V1->mt;
-    }
-
     /*
      * ztpmqrt = Q1->nb * ib
      */
@@ -94,12 +87,6 @@ void morse_pztpgqrt( int L,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, V1->mb, V1->nb, minMT*V1->mb, V1->nb, 0, 0, minMT*V1->mb, V1->nb, V1->p, V1->q);
-#endif
-
     for (k = V1->nt-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
 
@@ -152,13 +139,13 @@ void morse_pztpgqrt( int L,
             &options,
             MorseLower, tempkm, tempkk, V1->nb,
             V1(k, k), ldvk,
-            DIAG(k), ldvk );
+            D(k), ldvk );
 #if defined(CHAMELEON_USE_CUDA)
         MORSE_TASK_zlaset(
             &options,
             MorseUpper, tempkm, tempkk,
             0., 1.,
-            DIAG(k), ldvk );
+            D(k), ldvk );
 #endif
 #endif
         for (n = k; n < Q1->nt; n++) {
@@ -167,7 +154,7 @@ void morse_pztpgqrt( int L,
                 &options,
                 MorseLeft, MorseNoTrans,
                 tempkm, tempnn, tempkk, ib, T1->nb,
-                DIAG(k), ldvk,
+                D(k), ldvk,
                 T1(k, k), T1->mb,
                 Q1(k, n), ldqk);
         }
@@ -178,11 +165,4 @@ void morse_pztpgqrt( int L,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG; (void)minMT;
 }
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index 3b50ac59df3a8aaa7ca4e4e5f5bf70ac7c208cb0..9ba7783179b094636cbb3867c47477870d05b3fe 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -34,22 +34,21 @@
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k) D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k) A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
  **/
-void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
+void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldqm;
@@ -91,12 +90,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb, A->p, A->q);
-#endif
-
     for (k = minMT-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
 
@@ -125,13 +118,13 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
             &options,
             MorseUpper, tempkmin, tempkn, A->nb,
             A(k, k), ldak,
-            DIAG(k), ldak );
+            D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
         MORSE_TASK_zlaset(
             &options,
             MorseLower, tempkmin, tempkn,
             0., 1.,
-            DIAG(k), ldak );
+            D(k), ldak );
 #endif
 #endif
         for (m = k; m < Q->mt; m++) {
@@ -141,7 +134,7 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                 &options,
                 MorseRight, MorseNoTrans,
                 tempmm, tempkn, tempkmin, ib, T->nb,
-                DIAG(k), ldak,
+                D(k), ldak,
                 T(k, k), T->mb,
                 Q(m, k), ldqm);
         }
@@ -151,11 +144,4 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index 722613bb426c559496192735af405e1821e0ef8e..a3b360d99d63140869ea3ec81d4300513f3ec231 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -34,9 +34,9 @@
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((n)/BS), 0
+#define D(m,n) D, ((n)/BS), 0
 #else
-#define DIAG(m,n) A, (m), (n)
+#define D(m,n) A, (m), (n)
 #endif
 
 /**
@@ -44,14 +44,13 @@
  *  reduction Householder) - dynamic scheduling
  **/
 void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
-                     MORSE_desc_t *T, int BS,
+                     MORSE_desc_t *T, MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, N, RD, lastRD;
@@ -88,15 +87,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    {
-        /* necessary to avoid dependencies between tasks regarding the diag tile */
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
     for (k = K-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
@@ -149,13 +139,13 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                 &options,
                 MorseUpper, tempkmin, tempNn, A->nb,
                 A(k, N), ldak,
-                DIAG(k, N), ldak );
+                D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseLower, tempkmin, tempNn,
                 0., 1.,
-                DIAG(k, N), ldak );
+                D(k, N), ldak );
 #endif
 #endif
             for (m = k; m < Q->mt; m++) {
@@ -166,7 +156,7 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                     MorseRight, MorseNoTrans,
                     tempmm, tempNn,
                     tempkmin, ib, T->nb,
-                    DIAG(k, N), ldak,
+                    D(k, N), ldak,
                     T(k, N), T->mb,
                     Q(m, N), ldqm);
             }
@@ -177,11 +167,4 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index d5a258ac88b9d91f4776697a9647f3ec758adbab..da851bcc94e392cd8dc4e7001a98ce669c4495b1 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -34,22 +34,21 @@
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k) D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k) A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
  **/
-void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
+void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldqk, ldam, ldqm;
@@ -91,12 +90,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb, A->p, A->q);
-#endif
-
     for (k = minMT-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
 
@@ -127,13 +120,13 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
             &options,
             MorseLower, tempkm, tempkmin, A->nb,
             A(k, k), ldak,
-            DIAG(k), ldak );
+            D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
         MORSE_TASK_zlaset(
             &options,
             MorseUpper, tempkm, tempkmin,
             0., 1.,
-            DIAG(k), ldak );
+            D(k), ldak );
 #endif
 #endif
         for (n = k; n < Q->nt; n++) {
@@ -142,7 +135,7 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                 &options,
                 MorseLeft, MorseNoTrans,
                 tempkm, tempnn, tempkmin, ib, T->nb,
-                DIAG(k), ldak,
+                D(k), ldak,
                 T(k, k), T->mb,
                 Q(k, n), ldqk);
         }
@@ -152,11 +145,4 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index c5d2d9c0a024f5cb9b6399c5c355f356e52499e4..49565edb59298c7393ff862135b2b2129031478d 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -36,9 +36,9 @@
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((m)/BS), 0
+#define D(m,n) D, ((m)/BS), 0
 #else
-#define DIAG(m,n) A, (m), (n)
+#define D(m,n) A, (m), (n)
 #endif
 
 /**
@@ -46,14 +46,13 @@
  *  reduction Householder) - dynamic scheduling
  **/
 void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
-                     MORSE_desc_t *T, int BS,
+                     MORSE_desc_t *T,  MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, M, RD, lastRD;
@@ -90,15 +89,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    {
-        /* necessary to avoid dependencies between tasks regarding the diag tile */
-        int nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
     for (k = K-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
@@ -155,13 +145,13 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                 &options,
                 MorseLower, tempMm, tempkmin, A->nb,
                 A(M, k), ldaM,
-                DIAG(M, k), ldaM );
+                D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseUpper, tempMm, tempkmin,
                 0., 1.,
-                DIAG(M, k), ldaM );
+                D(M, k), ldaM );
 #endif
 #endif
             for (n = k; n < Q->nt; n++) {
@@ -171,7 +161,7 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                     MorseLeft, MorseNoTrans,
                     tempMm, tempnn,
                     tempkmin, ib, T->nb,
-                    DIAG(M, k), ldaM,
+                    D(M, k), ldaM,
                     T(M, k), T->mb,
                     Q(M, n), ldqM);
             }
@@ -182,11 +172,4 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index 94824c3bc7f1a2d0e08d88a097b23af853564034..577ddbfc0eb7bb8052be6d6c26fe89148cfaeb6f 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -35,23 +35,22 @@
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k) D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k) A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
  **/
 void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
-                   MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
+                   MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldbk, ldbm;
@@ -93,12 +92,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-#if defined(CHAMELEON_COPY_DIAG)
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb, A->p, A->q);
-#endif
-
     if (side == MorseLeft ) {
         if (trans == MorseNoTrans) {
             /*
@@ -116,13 +109,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkm,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (n = 0; n < B->nt; n++) {
@@ -131,7 +124,7 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempkm, tempnn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(k, n), ldbk);
                 }
@@ -185,13 +178,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkm,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (n = 0; n < B->nt; n++) {
@@ -200,7 +193,7 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempkm, tempnn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(k, n), ldbk);
                 }
@@ -240,13 +233,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkn,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (m = 0; m < B->mt; m++) {
@@ -256,7 +249,7 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
@@ -279,13 +272,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkn,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (m = 0; m < B->mt; m++) {
@@ -295,7 +288,7 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
@@ -322,11 +315,4 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 4c637236d1937debbaecef686ddf1c2c878bf91c..e29ba396482cb6bdc7148dc4f142438b5e763025 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -36,9 +36,9 @@
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((n)/BS), 0
+#define D(m,n) D, ((n)/BS), 0
 #else
-#define DIAG(m,n) A, (m), (n)
+#define D(m,n) A, (m), (n)
 #endif
 
 /***************************************************************************//**
@@ -46,14 +46,13 @@
  *  Householder) - dynamic scheduling
  **/
 void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
-                     MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, int BS,
+                     MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, N, RD, lastRD;
@@ -89,15 +88,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    {
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
     if (side == MorseLeft ) {
         if (trans == MorseNoTrans) {
@@ -118,13 +108,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
                         0., 1.,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #endif
 #endif
                     for (n = 0; n < B->nt; n++) {
@@ -134,7 +124,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                             side, trans,
                             tempNn, tempnn,
                             tempkmin, ib, T->nb,
-                            DIAG(k, N), ldak,
+                            D(k, N), ldak,
                             T(k, N), T->mb,
                             B(N, n), ldbN);
                     }
@@ -233,13 +223,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
                         0., 1.,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #endif
 #endif
                     for (n = 0; n < B->nt; n++) {
@@ -249,7 +239,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                             side, trans,
                             tempNn, tempnn,
                             tempkmin, ib, T->nb,
-                            DIAG(k, N), ldak,
+                            D(k, N), ldak,
                             T(k, N), T->mb,
                             B(N, n), ldbN);
                     }
@@ -314,13 +304,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
                         0., 1.,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #endif
 #endif
                     for (m = 0; m < B->mt; m++) {
@@ -331,7 +321,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                             side, trans,
                             tempmm, tempNn,
                             tempkmin, ib, T->nb,
-                            DIAG(k, N), ldak,
+                            D(k, N), ldak,
                             T(k, N), T->mb,
                             B(m, N), ldbm);
                     }
@@ -356,13 +346,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
                         0., 1.,
-                        DIAG(k, N), ldak );
+                        D(k, N), ldak );
 #endif
 #endif
                     for (m = 0; m < B->mt; m++) {
@@ -373,7 +363,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                             side, trans,
                             tempmm, tempNn,
                             tempkmin, ib, T->nb,
-                            DIAG(k, N), ldak,
+                            D(k, N), ldak,
                             T(k, N), T->mb,
                             B(m, N), ldbm);
                     }
@@ -420,11 +410,4 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index cdf4cbdd3e713f2dd38c866726c0e8720593755c..c0eff1c9579002a1514664ad4be9c4cef47354b0 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -35,23 +35,22 @@
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(k) DIAG, k, 0
+#define D(k) D, k, 0
 #else
-#define DIAG(k) A, k, k
+#define D(k) A, k, k
 #endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
  **/
 void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
-                   MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T,
+                   MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D,
                    MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int ldak, ldbk, ldam, ldan, ldbm;
@@ -93,12 +92,6 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-#if defined(CHAMELEON_COPY_DIAG)
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, minMT*A->nb, A->nb, 0, 0, minMT*A->nb, A->nb, A->p, A->q);
-#endif
-
     if (side == MorseLeft ) {
         if (trans == MorseConjTrans) {
             /*
@@ -116,13 +109,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkm, tempkmin,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (n = 0; n < B->nt; n++) {
@@ -131,7 +124,7 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempkm, tempnn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(k, n), ldbk);
                 }
@@ -187,13 +180,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkm, tempkmin,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (n = 0; n < B->nt; n++) {
@@ -202,7 +195,7 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempkm, tempnn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(k, n), ldbk);
                 }
@@ -244,13 +237,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkn, tempkmin,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (m = 0; m < B->mt; m++) {
@@ -260,7 +253,7 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
@@ -283,13 +276,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #if defined(CHAMELEON_USE_CUDA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkn, tempkmin,
                     0., 1.,
-                    DIAG(k), ldak );
+                    D(k), ldak );
 #endif
 #endif
                 for (m = 0; m < B->mt; m++) {
@@ -299,7 +292,7 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                         &options,
                         side, trans,
                         tempmm, tempkn, tempkmin, ib, T->nb,
-                        DIAG(k), ldak,
+                        D(k), ldak,
                         T(k, k), T->mb,
                         B(m, k), ldbm);
                 }
@@ -327,11 +320,4 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index 777ec71ad5a96e12e3870131fdefa24611dec60a..76a6eef59a7a1b5c3c4c7370717a544683c01647 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -36,9 +36,9 @@
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  ((n)+A->nt)
 #if defined(CHAMELEON_COPY_DIAG)
-#define DIAG(m,n) DIAG, ((m)/BS), 0
+#define D(m,n) D, ((m)/BS), 0
 #else
-#define DIAG(m,n) A, (m), (n)
+#define D(m,n) A, (m), (n)
 #endif
 
 /***************************************************************************//**
@@ -46,14 +46,13 @@
  *  Householder) - dynamic scheduling
  **/
 void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
-                     MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, int BS,
+                     MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, int BS,
                      MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *DIAG = NULL;
 
     int k, m, n;
     int K, M, RD, lastRD;
@@ -90,16 +89,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-    {
-        int nblk = ( A->mt + BS -1 ) / BS;
-        nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc_diag(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb, A->p, A->q);
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
     if (side == MorseLeft ) {
         if (trans == MorseConjTrans) {
@@ -120,13 +109,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
                         0., 1.,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #endif
 #endif
                     for (n = 0; n < B->nt; n++) {
@@ -135,7 +124,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                             &options,
                             side, trans,
                             tempMm, tempnn, tempkmin, ib, T->nb,
-                            DIAG(M, k), ldaM,
+                            D(M, k), ldaM,
                             T(M, k), T->mb,
                             B(M, n), ldbM);
                     }
@@ -238,13 +227,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
                         0., 1.,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #endif
 #endif
                     for (n = 0; n < B->nt; n++) {
@@ -254,7 +243,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                             side, trans,
                             tempMm, tempnn,
                             tempkmin, ib, T->nb,
-                            DIAG(M, k), ldaM,
+                            D(M, k), ldaM,
                             T(M, k), T->mb,
                             B(M, n), ldbM);
                     }
@@ -320,13 +309,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
                         0., 1.,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #endif
 #endif
                     for (m = 0; m < B->mt; m++) {
@@ -336,7 +325,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                             &options,
                             side, trans,
                             tempmm, tempMm, tempkmin, ib, T->nb,
-                            DIAG(M, k), ldaM,
+                            D(M, k), ldaM,
                             T(M, k), T->mb,
                             B(m, M), ldbm);
                     }
@@ -361,13 +350,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #if defined(CHAMELEON_USE_CUDA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
                         0., 1.,
-                        DIAG(M, k), ldaM );
+                        D(M, k), ldaM );
 #endif
 #endif
                     for (m = 0; m < B->mt; m++) {
@@ -377,7 +366,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                             &options,
                             side, trans,
                             tempmm, tempMm, tempkmin, ib, T->nb,
-                            DIAG(M, k), ldaM,
+                            D(M, k), ldaM,
                             T(M, k), T->mb,
                             B(m, M), ldbm);
                     }
@@ -426,11 +415,4 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(DIAG);
-    free(DIAG);
-#endif
-    (void)DIAG;
 }
diff --git a/compute/zgelqf.c b/compute/zgelqf.c
index 95fe0544f6fcbd4098e3c44ef98b443f9d711e19..6fb6606b14c249906af6c883978856eb4afd65ca 100644
--- a/compute/zgelqf.c
+++ b/compute/zgelqf.c
@@ -238,6 +238,7 @@ int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -277,12 +278,26 @@ int MORSE_zgelqf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
     if (chameleon_min(M, N) == 0)
         return MORSE_SUCCESS;
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    /* Workspace for the diagonal tiles: avoids task dependencies on A's diagonal tiles */
+    {
+        int m = chameleon_min(A->mt, A->nt) * A->mb;
+        morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        Dptr = &D;
+    }
+#endif
+
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzgelqf(A, T, sequence, request);
+        morse_pzgelqf(A, T, Dptr, sequence, request);
     }
     else {
-        morse_pzgelqfrh(A, T, MORSE_RHBLK, sequence, request);
+        morse_pzgelqfrh(A, T, Dptr, MORSE_RHBLK, sequence, request);
     }
-
+    if (Dptr != NULL) {
+        /* Tasks submitted above may still access D: wait for them before freeing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgelqs.c b/compute/zgelqs.c
index b998694a910e89f1dd1ca315b35bad2a22bcdf36..1c8186c47914c86502754696f1e243363ec25322 100644
--- a/compute/zgelqs.c
+++ b/compute/zgelqs.c
@@ -266,6 +266,7 @@ int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B,
     MORSE_desc_t *subB;
     MORSE_desc_t *subA;
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -320,12 +321,27 @@ int MORSE_zgelqs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B,
     free(subA);
     free(subB);
 
+#if defined(CHAMELEON_COPY_DIAG)
+    /* Workspace for the diagonal tiles: avoids task dependencies on A's diagonal tiles */
+    {
+        int m = chameleon_min(A->mt, A->nt) * A->mb;
+        morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        Dptr = &D;
+    }
+#endif
+
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzunmlq(MorseLeft, MorseConjTrans, A, B, T, sequence, request);
+        morse_pzunmlq(MorseLeft, MorseConjTrans, A, B, T, Dptr, sequence, request);
     }
     else {
-        morse_pzunmlqrh(MorseLeft, MorseConjTrans, A, B, T, MORSE_RHBLK, sequence, request);
+        morse_pzunmlqrh(MorseLeft, MorseConjTrans, A, B, T, Dptr, MORSE_RHBLK, sequence, request);
     }
 
+    if (Dptr != NULL) {
+        /* Tasks submitted above may still access D: wait for them before freeing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgels.c b/compute/zgels.c
index 8b5093369e3d088a4359d89a322cd8ddef0d9ae5..53314a3fe1388299402c1672a0fd026dbb8d7fc4 100644
--- a/compute/zgels.c
+++ b/compute/zgels.c
@@ -29,7 +29,7 @@
  **/
 #include "control/common.h"
 
-/***************************************************************************//**
+/**
  *
  * @ingroup MORSE_Complex64_t
  *
@@ -328,6 +328,7 @@ int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A,
     MORSE_desc_t *subA;
     MORSE_desc_t *subB;
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -379,45 +380,69 @@ int MORSE_zgels_Tile_Async(MORSE_enum trans, MORSE_desc_t *A,
     }
      */
     if (A->m >= A->n) {
+
+#if defined(CHAMELEON_COPY_DIAG)
+        /* Workspace for the diagonal tiles: avoids task dependencies on A's diagonal tiles */
+        {
+            int n = chameleon_min(A->mt, A->nt) * A->nb;
+            morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+            Dptr = &D;
+        }
+#endif
         if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-            morse_pzgeqrf(A, T, sequence, request);
 
-            morse_pzunmqr(MorseLeft, MorseConjTrans, A, B, T, sequence, request);
+            morse_pzgeqrf(A, T, Dptr, sequence, request);
+
+            morse_pzunmqr(MorseLeft, MorseConjTrans, A, B, T, Dptr, sequence, request);
         }
         else {
-            morse_pzgeqrfrh(A, T, MORSE_RHBLK, sequence, request);
+            morse_pzgeqrfrh(A, T, Dptr, MORSE_RHBLK, sequence, request);
 
-            morse_pzunmqrrh(MorseLeft, MorseConjTrans, A, B, T, MORSE_RHBLK, sequence, request);
+            morse_pzunmqrrh(MorseLeft, MorseConjTrans, A, B, T, Dptr, MORSE_RHBLK, sequence, request);
         }
         subB = morse_desc_submatrix(B, 0, 0, A->n, B->n);
         subA = morse_desc_submatrix(A, 0, 0, A->n, A->n);
         morse_pztrsm(MorseLeft, MorseUpper, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
-        free(subA);
-        free(subB);
+
     }
     else {
         /* subB = morse_desc_submatrix(B, A->m, 0, A->n-A->m, B->n);
         morse_pztile_zero(subB, sequence, request);
         free(subB); */
-
+#if defined(CHAMELEON_COPY_DIAG)
+        /* Workspace for the diagonal tiles: avoids task dependencies on A's diagonal tiles */
+        {
+            int m = chameleon_min(A->mt, A->nt) * A->mb;
+            morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+            Dptr = &D;
+        }
+#endif
         if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-            morse_pzgelqf(A, T, sequence, request);
+            morse_pzgelqf(A, T, Dptr, sequence, request);
         }
         else {
-            morse_pzgelqfrh(A, T, MORSE_RHBLK, sequence, request);
+            morse_pzgelqfrh(A, T, Dptr, MORSE_RHBLK, sequence, request);
         }
         subB = morse_desc_submatrix(B, 0, 0, A->m, B->n);
         subA = morse_desc_submatrix(A, 0, 0, A->m, A->m);
         morse_pztrsm(MorseLeft, MorseLower, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
-        free(subA);
-        free(subB);
 
         if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-            morse_pzunmlq(MorseLeft, MorseConjTrans, A, B, T, sequence, request);
+            morse_pzunmlq(MorseLeft, MorseConjTrans, A, B, T, Dptr, sequence, request);
         }
         else {
-            morse_pzunmlqrh(MorseLeft, MorseConjTrans, A, B, T, MORSE_RHBLK, sequence, request);
+            morse_pzunmlqrh(MorseLeft, MorseConjTrans, A, B, T, Dptr, MORSE_RHBLK, sequence, request);
         }
     }
+
+    free(subA);
+    free(subB);
+
+    if (Dptr != NULL) {
+        /* Tasks submitted above may still access D: wait for them before freeing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgeqrf.c b/compute/zgeqrf.c
index f17810f5cd0b0ee6a5abf7dc83c2b5e54f5d2b76..de992ffa755e19cac65af0dcb5cc62332ea68a4d 100644
--- a/compute/zgeqrf.c
+++ b/compute/zgeqrf.c
@@ -238,6 +238,7 @@ int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -277,12 +278,26 @@ int MORSE_zgeqrf_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T,
     if (chameleon_min(M, N) == 0)
         return MORSE_SUCCESS;
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    /* Workspace for the diagonal tiles: avoids task dependencies on A's diagonal tiles */
+    {
+        int n = chameleon_min(A->mt, A->nt) * A->nb;
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        Dptr = &D;
+    }
+#endif
+
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzgeqrf(A, T, sequence, request);
+        morse_pzgeqrf(A, T, Dptr, sequence, request);
     }
     else {
-        morse_pzgeqrfrh(A, T, MORSE_RHBLK, sequence, request);
+        morse_pzgeqrfrh(A, T, Dptr, MORSE_RHBLK, sequence, request);
     }
-
+    if (Dptr != NULL) {
+        /* Tasks submitted above may still access D: wait for them before freeing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgeqrs.c b/compute/zgeqrs.c
index 360d13ae92960bac2f4396b70abbe06265ca63ad..757568882712e140b66c622f5d461bcaddbbb056 100644
--- a/compute/zgeqrs.c
+++ b/compute/zgeqrs.c
@@ -263,6 +263,7 @@ int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B,
     MORSE_desc_t *subA;
     MORSE_desc_t *subB;
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -307,11 +308,19 @@ int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B,
         return MORSE_SUCCESS;
     }
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int n = chameleon_min(A->mt, A->nt) * A->nb;
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        Dptr = &D;
+    }
+#endif
+
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzunmqr(MorseLeft, MorseConjTrans, A, B, T, sequence, request);
+        morse_pzunmqr(MorseLeft, MorseConjTrans, A, B, T, Dptr, sequence, request);
     }
     else {
-        morse_pzunmqrrh(MorseLeft, MorseConjTrans, A, B, T, MORSE_RHBLK, sequence, request);
+        morse_pzunmqrrh(MorseLeft, MorseConjTrans, A, B, T, Dptr, MORSE_RHBLK, sequence, request);
     }
 
     subB = morse_desc_submatrix(B, 0, 0, A->n, B->n);
@@ -320,5 +329,9 @@ int MORSE_zgeqrs_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *B,
     free(subA);
     free(subB);
 
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgesvd.c b/compute/zgesvd.c
index f61ddd9b970dba43379b395d10c48b1d4e3298f0..4aea26f51876378bf7435b626237f9c3f31afe1f 100644
--- a/compute/zgesvd.c
+++ b/compute/zgesvd.c
@@ -398,6 +398,7 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
     MORSE_desc_t descT;
     MORSE_desc_t descU, descVT;
     MORSE_desc_t descAB;
+    MORSE_desc_t D, *Dptr = NULL;
     MORSE_desc_t *subA, *subT, *subUVT;
     double *E;
     int M, N, MINMN, NB, LDAB;
@@ -459,9 +460,14 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
     NB    = descA.mb;
     LDAB  = NB + 1;
     uplo  = M >= N ? MorseUpper : MorseLower;
-
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, A->n, 0, 0, A->m, A->n, );
+        Dptr = &D;
+    }
+#endif
     /* Reduction to band */
-    morse_pzgebrd_ge2gb( descA, descT,
+    morse_pzgebrd_ge2gb( descA, descT, D,
                          sequence, request );
 
     /* Allocate band structure */
@@ -556,12 +562,12 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
             subUVT = morse_desc_submatrix(&descU, descU.mb, 0, descU.m-descU.mb, descU.n);
             subT   = morse_desc_submatrix(&descT, descT.mb, 0, descT.m-descT.mb, descT.n-descT.nb);
             morse_pzunmqr( MorseLeft, MorseNoTrans,
-                           subA, subUVT, subT,
+                           subA, subUVT, subT, Dptr,
                            sequence, request );
         }
         else {
             morse_pzunmqr( MorseLeft, MorseNoTrans,
-                           &descA, &descU, &descT,
+                           &descA, &descU, &descT, Dptr,
                            sequence, request );
         }
     }
@@ -569,7 +575,7 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
     if ( jobvt != MorseNoVec ) {
         if ( M < N ){
             morse_pzunmlq( MorseRight, MorseNoTrans,
-                           &descA, &descVT, &descT,
+                           &descA, &descVT, &descT, Dptr,
                            sequence, request );
         }
         else {
@@ -577,7 +583,7 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
             subUVT = morse_desc_submatrix(&descVT, 0, descVT.nb, descVT.m,         descVT.n-descVT.nb);
             subT   = morse_desc_submatrix(&descT,  0, descT.nb,  descT.m-descT.mb, descT.n -descT.nb);
             morse_pzunmlq( MorseRight, MorseNoTrans,
-                           subA, subUVT, subT,
+                           subA, subUVT, subT, Dptr,
                            sequence, request );
         }
     }
@@ -612,5 +618,9 @@ int MORSE_zgesvd_Tile_Async(MORSE_enum jobu, MORSE_enum jobvt,
     if (jobvt != MorseNoVec)
         morse_desc_mat_free( &descVT );
     free(E);
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zheevd.c b/compute/zheevd.c
index 664fb2ab3f2968f77fc3fd173a8d2f8b36708225..671ffc5c16a668ce5c046075bde6b9841eee5dda 100644
--- a/compute/zheevd.c
+++ b/compute/zheevd.c
@@ -335,6 +335,7 @@ int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo,
     MORSE_context_t *morse;
     MORSE_desc_t descA;
     MORSE_desc_t descT;
+    MORSE_desc_t D, *Dptr = NULL;
     MORSE_Complex64_t *Q2;
     int N, NB, status;
     double *E;
@@ -468,13 +469,20 @@ int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo,
                         morse_desc_mat_free(&(descQ2)); morse_desc_mat_free(&(descV)) );
     if (uplo == MorseLower)
     {
+#if defined(CHAMELEON_COPY_DIAG)
+        {
+            int n = chameleon_min(A->mt, A->nt) * A->nb;
+            morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+            Dptr = &D;
+        }
+#endif
         subA = morse_desc_submatrix(&descA,  descA.mb,  0, descA.m -descA.mb,  descA.n-descA.nb);
         subQ = morse_desc_submatrix(&descQ2, descQ2.mb, 0, descQ2.m-descQ2.mb, descQ2.n        );
         subT = morse_desc_submatrix(&descT,  descT.mb,  0, descT.m -descT.mb,  descT.n-descT.nb);
 
         /* Compute Q2 = Q1 * Q2 */
         morse_pzunmqr( MorseLeft, MorseNoTrans,
-                       subA, subQ, subT,
+                       subA, subQ, subT, Dptr,
                        sequence, request );
 
         /* Compute the final eigenvectors A = (Q1 * Q2) * V */
@@ -485,13 +493,20 @@ int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo,
 
     }
     else {
+#if defined(CHAMELEON_COPY_DIAG)
+        {
+            int m = chameleon_min(A->mt, A->nt) * A->mb;
+            morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+            Dptr = &D;
+        }
+#endif
         subA = morse_desc_submatrix(&descA,  0, descA.nb,  descA.m -descA.mb,  descA.n -descA.nb );
         subQ = morse_desc_submatrix(&descQ2, descQ2.mb, 0, descQ2.m-descQ2.mb, descQ2.n          );
         subT = morse_desc_submatrix(&descT,  0, descT.nb,  descT.m -descT.mb,  descT.n -descT.nb );
 
         /* Compute Q2 = Q1^h * Q2 */
         morse_pzunmlq( MorseLeft, MorseConjTrans,
-                       subA, subQ, subT,
+                       subA, subQ, subT, Dptr,
                        sequence, request );
 
         /* Compute the final eigenvectors A =  (Q1^h * Q2) * V */
@@ -511,5 +526,9 @@ int MORSE_zheevd_Tile_Async(MORSE_enum jobz, MORSE_enum uplo,
     free(V);
 
     free(E);
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zhetrd.c b/compute/zhetrd.c
index b74a90afc880a8a8f044cb0d421e727077e4db53..e3a6179cc2d1f430bdca9ff15308364c7ea27297 100644
--- a/compute/zhetrd.c
+++ b/compute/zhetrd.c
@@ -334,6 +334,7 @@ int MORSE_zhetrd_Tile_Async(MORSE_enum jobz,
     MORSE_desc_t descAB;
     int N, NB, LDAB;
     int status;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -387,9 +388,14 @@ int MORSE_zhetrd_Tile_Async(MORSE_enum jobz,
 
     N  = descA.m;
     NB = descA.mb;
-
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        morse_zdesc_alloc_diag(D, A->mb, A->nb, chameleon_min(A->m, A->n), A->nb, 0, 0, chameleon_min(A->m, A->n), A->nb, A->p, A->q);
+        Dptr = &D;
+    }
+#endif
     /* Reduction to band. On exit, T contains reflectors */
-    morse_pzhetrd_he2hb( uplo, A, T,
+    morse_pzhetrd_he2hb( uplo, A, T, Dptr,
                          sequence, request );
 
     LDAB = NB+1;
@@ -419,7 +425,9 @@ int MORSE_zhetrd_Tile_Async(MORSE_enum jobz,
         morse_error("MORSE_zhetrd_Tile_Async", "LAPACKE_zhbtrd failed");
     }
 #endif /* !defined(CHAMELEON_SIMULATION) */
-
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
     morse_desc_mat_free(&descAB);
     return MORSE_SUCCESS;
 }
diff --git a/compute/ztpgqrt.c b/compute/ztpgqrt.c
index 1cdab39d879bfe7d19355e55a7e60e33a8af37fe..3943a31d8b13e452e163789721c1a7017a1e59f7 100644
--- a/compute/ztpgqrt.c
+++ b/compute/ztpgqrt.c
@@ -341,6 +341,7 @@ int MORSE_ztpgqrt_Tile_Async( int L,
                               MORSE_sequence_t *sequence, MORSE_request_t *request )
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -395,15 +396,29 @@ int MORSE_ztpgqrt_Tile_Async( int L,
         morse_error("MORSE_ztpgqrt_Tile", "Triangular part must be aligned with tiles");
         return morse_request_fail(sequence, request, MORSE_ERR_ILLEGAL_VALUE);
     }
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int minMT;
+        if (V1->m > V1->n) {
+            minMT = V1->nt;
+        } else {
+            minMT = V1->mt;
+        }
+        morse_zdesc_alloc_diag(D, V1->mb, V1->nb, minMT*V1->mb, V1->nb, 0, 0, minMT*V1->mb, V1->nb, V1->p, V1->q);
+        Dptr = &D;
+    }
+#endif
 
     /* if (morse->householder == MORSE_FLAT_HOUSEHOLDER) { */
     morse_pzlaset( MorseUpperLower, 0., 1., Q1, sequence, request );
     morse_pzlaset( MorseUpperLower, 0., 0., Q2, sequence, request );
-    morse_pztpgqrt( L, V1, T1, V2, T2, Q1, Q2, sequence, request );
+    morse_pztpgqrt( L, V1, T1, V2, T2, Q1, Q2, Dptr, sequence, request );
     /* } */
     /* else { */
     /*    morse_pztpgqrtrh(Q1, T, MORSE_RHBLK, sequence, request); */
     /* } */
-
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
     return MORSE_SUCCESS;
 }
diff --git a/compute/zunglq.c b/compute/zunglq.c
index 78865dd0228bf36a3e672f5fcbff6ab8657b535f..0ee4758b652a79c973ae9c54278ca92be62c155c 100644
--- a/compute/zunglq.c
+++ b/compute/zunglq.c
@@ -160,7 +160,7 @@ int MORSE_zunglq(int M, int N, int K,
 /*        morse_ziptile2lap( descQ, Q, NB, NB, LDQ, N,  sequence, &request);*/
 /*        morse_sequence_wait(morse, sequence);*/
 /*    }*/
-        
+
     status = sequence->status;
     morse_sequence_destroy(morse, sequence);
     return status;
@@ -216,8 +216,8 @@ int MORSE_zunglq_Tile(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *Q)
     MORSE_zunglq_Tile_Async(A, T, Q, sequence, &request);
     morse_sequence_wait(morse, sequence);
     RUNTIME_desc_getoncpu(A);
-        RUNTIME_desc_getoncpu(Q);
-    
+    RUNTIME_desc_getoncpu(Q);
+
     status = sequence->status;
     morse_sequence_destroy(morse, sequence);
     return status;
@@ -254,6 +254,7 @@ int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *Q,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -298,13 +299,26 @@ int MORSE_zunglq_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *Q,
     if (chameleon_min(M, N) == 0)
         return MORSE_SUCCESS;
 */
+
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int m = chameleon_min(A->mt, A->nt) * A->mb;
+        morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        Dptr = &D;
+    }
+#endif
+
     morse_pzlaset(MorseUpperLower, 0., 1., Q, sequence, request);
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzunglq(A, Q, T, sequence, request);
+        morse_pzunglq(A, Q, T, Dptr, sequence, request);
     }
     else {
-        morse_pzunglqrh(A, Q, T, MORSE_RHBLK, sequence, request);
+        morse_pzunglqrh(A, Q, T, Dptr, MORSE_RHBLK, sequence, request);
     }
 
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zungqr.c b/compute/zungqr.c
index cb136d4176f92d0ebffcbc451428efe7187ee053..122b1b911adce9a201c93dbfcc62a9c41ef6405a 100644
--- a/compute/zungqr.c
+++ b/compute/zungqr.c
@@ -253,6 +253,7 @@ int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *Q,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -296,13 +297,25 @@ int MORSE_zungqr_Tile_Async(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *Q,
     if (N <= 0)
         return MORSE_SUCCESS;
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int n = chameleon_min(A->mt, A->nt) * A->nb;
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        Dptr = &D;
+    }
+#endif
+
     morse_pzlaset(MorseUpperLower, 0., 1., Q, sequence, request);
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
-        morse_pzungqr(A, Q, T, sequence, request);
+        morse_pzungqr(A, Q, T, Dptr, sequence, request);
     }
     else {
-        morse_pzungqrrh(A, Q, T, MORSE_RHBLK, sequence, request);
+        morse_pzungqrrh(A, Q, T, Dptr, MORSE_RHBLK, sequence, request);
     }
 
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zunmlq.c b/compute/zunmlq.c
index 1138f4aee3a86d9f12658ede2fc41f97587140a2..33cf72a309ad39382c9c15b17b60caf59b46dcb0 100644
--- a/compute/zunmlq.c
+++ b/compute/zunmlq.c
@@ -311,6 +311,7 @@ int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -361,17 +362,28 @@ int MORSE_zunmlq_Tile_Async(MORSE_enum side, MORSE_enum trans,
     if (chameleon_min(M, chameleon_min(N, K)) == 0)
         return MORSE_SUCCESS;
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int m = chameleon_min(A->mt, A->nt) * A->mb;
+        morse_zdesc_alloc(D, A->mb, A->nb, m, A->n, 0, 0, m, A->n, );
+        Dptr = &D;
+    }
+#endif
+
     if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
         if ( (trans == MorseConjTrans) &&
              (side == MorseLeft) ) {
-            morse_pzunmlq(side, trans, A, C, T, sequence, request);
+            morse_pzunmlq(side, trans, A, C, T, Dptr, sequence, request);
         } else {
-            morse_pzunmlq(side, trans, A, C, T, sequence, request);
+            morse_pzunmlq(side, trans, A, C, T, Dptr, sequence, request);
         }
     }
     else {
-        morse_pzunmlqrh(side, trans, A, C, T, MORSE_RHBLK, sequence, request);
+        morse_pzunmlqrh(side, trans, A, C, T, Dptr, MORSE_RHBLK, sequence, request);
     }
-
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zunmqr.c b/compute/zunmqr.c
index 709947e87ccbe27cf26f30283026ba39b9aea3d6..a32d603674ffd69e6baf2dc2175d923a6b0dd808 100644
--- a/compute/zunmqr.c
+++ b/compute/zunmqr.c
@@ -312,6 +312,7 @@ int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans,
                              MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D, *Dptr = NULL;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -362,18 +363,31 @@ int MORSE_zunmqr_Tile_Async(MORSE_enum side, MORSE_enum trans,
     if (chameleon_min(M, chameleon_min(N, K)) == 0)
         return MORSE_SUCCESS;
 */
-    if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
+
+#if defined(CHAMELEON_COPY_DIAG)
+    {
+        int n = chameleon_min(A->mt, A->nt) * A->nb;
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, n, 0, 0, A->m, n, );
+        Dptr = &D;
+    }
+#endif
+
+    if (morse->householder == MORSE_FLAT_HOUSEHOLDER) {
         if ( (trans == MorseConjTrans) &&
              (side == MorseLeft) ) {
-            morse_pzunmqr(side, trans, A, C, T, sequence, request);
+            morse_pzunmqr(side, trans, A, C, T, Dptr, sequence, request);
         }
         else {
-            morse_pzunmqr(side, trans, A, C, T, sequence, request);
+            morse_pzunmqr(side, trans, A, C, T, Dptr, sequence, request);
         }
     }
     else {
-        morse_pzunmqrrh(side, trans, A, C, T, MORSE_RHBLK, sequence, request);
+        morse_pzunmqrrh(side, trans, A, C, T, Dptr, MORSE_RHBLK, sequence, request);
     }
 
+    if (Dptr != NULL) {
+        morse_desc_mat_free(Dptr);
+    }
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/control/compute_z.h b/control/compute_z.h
index f9608bdb219ae7800a2e1c590fc1d7f29e695c2f..2ac2d5090918260460aad57952c551f59ff4e433 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -89,12 +89,12 @@ void morse_pzbarrier_row2tl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_r
 void morse_pzbarrier_tl2pnl(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzbarrier_tl2row(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgebrd_gb2bd(MORSE_enum uplo, MORSE_desc_t *A, double *D, double *E, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzgebrd_ge2gb(MORSE_desc_t A, MORSE_desc_t T, MORSE_desc_t D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgemm(MORSE_enum transA, MORSE_enum transB, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgetmi2(MORSE_enum idep, MORSE_enum odep, MORSE_enum storev, int m, int n, int mb, int nb, MORSE_Complex64_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzgetrf_nopiv(MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
@@ -106,7 +106,7 @@ void morse_pzhemm(MORSE_enum side, MORSE_enum uplo, MORSE_Complex64_t alpha, MOR
 void morse_pzherk(MORSE_enum uplo, MORSE_enum trans, double alpha, MORSE_desc_t *A, double beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzher2k(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, double beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 #endif
-void morse_pzhetrd_he2hb(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzhetrd_he2hb(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *T, MORSE_desc_t *E, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzlacpy(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzlag2c(MORSE_desc_t *A, MORSE_desc_t *SB, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzlange(MORSE_enum norm, MORSE_desc_t *A, double *result, MORSE_sequence_t *sequence, MORSE_request_t *request);
@@ -134,7 +134,7 @@ void morse_pzsyrk(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MO
 void morse_pzsyr2k(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_Complex64_t beta, MORSE_desc_t *C, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzsytrf(MORSE_enum uplo, MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztile2band(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *descAB, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pztpgqrt( int L, MORSE_desc_t *V1, MORSE_desc_t *T1, MORSE_desc_t *V2, MORSE_desc_t *T2, MORSE_desc_t *Q1, MORSE_desc_t *Q2, MORSE_sequence_t *sequence, MORSE_request_t *request );
+void morse_pztpgqrt( int L, MORSE_desc_t *V1, MORSE_desc_t *T1, MORSE_desc_t *V2, MORSE_desc_t *T2, MORSE_desc_t *Q1, MORSE_desc_t *Q2, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request );
 void morse_pztpqrt( int L, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request );
 void morse_pztradd(MORSE_enum uplo, MORSE_enum trans, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_Complex64_t beta, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pztrmm(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_enum diag, MORSE_Complex64_t alpha, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_sequence_t *sequence, MORSE_request_t *request);
@@ -144,15 +144,15 @@ void morse_pztrsmrv(MORSE_enum side, MORSE_enum uplo, MORSE_enum transA, MORSE_e
 void morse_pztrtri(MORSE_enum uplo, MORSE_enum diag, MORSE_desc_t *A, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzungbr(MORSE_enum side, MORSE_desc_t *A, MORSE_desc_t *O, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzungbrrh(MORSE_enum side, MORSE_desc_t *A, MORSE_desc_t *O, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzungtr(MORSE_enum uplo, MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_sequence_t *sequence, MORSE_request_t *request);
-void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunmqr(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunmlq(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, MORSE_sequence_t *sequence, MORSE_request_t *request);
+void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans, MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *T, MORSE_desc_t *D, int BS, MORSE_sequence_t *sequence, MORSE_request_t *request);
 void morse_pzbuild( MORSE_enum uplo, MORSE_desc_t *A, void *user_data, void* user_build_callback, MORSE_sequence_t *sequence, MORSE_request_t *request );
 
 void morse_pzgelqf_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_desc_t *TS, MORSE_desc_t *TT, MORSE_desc_t *D,