diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c
index d85123145364e34e8b41bcb179b2f78aaa085b87..2bebdfea40f415741c912833ab7c8ae9a1bd7dc6 100644
--- a/compute/pzgelqf_param.c
+++ b/compute/pzgelqf_param.c
@@ -37,14 +37,14 @@
 /**
  *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
  */
-void morse_pzgelqf_param( const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_desc_t *TS, MORSE_desc_t *TT,
+void morse_pzgelqf_param( const libhqr_tree_t *qrtree, MORSE_desc_t *A,
+                          MORSE_desc_t *TS, MORSE_desc_t *TT, MORSE_desc_t *D,
                           MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *D = NULL;
 
     int k, m, n, i, p;
     int K;
@@ -90,14 +90,6 @@ void morse_pzgelqf_param( const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_de
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-#if defined(CHAMELEON_COPY_DIAG)
-    {
-        /* necessary to avoid dependencies between tasks regarding the diag tile */
-        D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc(*D, A->mb, A->nb, A->m, A->n, 0, 0, A->m, A->n, );
-    }
-#endif
-
     K = chameleon_min(A->mt, A->nt);
 
     /* The number of the factorization */
@@ -208,11 +200,5 @@ void morse_pzgelqf_param( const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_de
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(D);
-    free(D);
-#endif
     (void)D;
 }
diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c
index 4a2e3da006b7154e804406ace7ff777da582bc13..2dbe8f83a54c4606c9d68150422a6bae95785f66 100644
--- a/compute/pzunglq_param.c
+++ b/compute/pzunglq_param.c
@@ -38,14 +38,13 @@
  *  Parallel construction of Q using tile V - dynamic scheduling
  */
 void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_desc_t *Q,
-                        MORSE_desc_t *TS, MORSE_desc_t *TT,
-                        MORSE_sequence_t *sequence, MORSE_request_t *request)
+                         MORSE_desc_t *TS, MORSE_desc_t *TT, MORSE_desc_t *D,
+                         MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *D = NULL;
 
     int k, m, n, i, p;
     int K;
@@ -89,12 +88,6 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
 
     K = chameleon_min(A->mt, A->nt);
 
-        /* necessary to avoid dependencies between tasks regarding the diag tile */
-#if defined(CHAMELEON_COPY_DIAG)
-    D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*D, A->mb, A->nb, K*A->mb, A->nb, 0, 0, K*A->mb, A->nb, A->p, A->q);
-#endif
-
     for (k = K-1; k >= 0; k--) {
         RUNTIME_iteration_push(morse, k);
 
@@ -178,11 +171,5 @@ void morse_pzunglq_param(const libhqr_tree_t *qrtree, MORSE_desc_t *A, MORSE_des
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(D);
-    free(D);
-#endif
     (void)D;
 }
diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c
index 3bcec90f391e6024ffab81cb2ac2dcbf6a09b2f5..92a9003244d61ee99a5ca9b46b4d5051f4192bb2 100644
--- a/compute/pzunmlq_param.c
+++ b/compute/pzunmlq_param.c
@@ -39,14 +39,14 @@
  */
 void morse_pzunmlq_param(const libhqr_tree_t *qrtree,
                          MORSE_enum side, MORSE_enum trans,
-                         MORSE_desc_t *A, MORSE_desc_t *B, MORSE_desc_t *TS, MORSE_desc_t *TT,
+                         MORSE_desc_t *A, MORSE_desc_t *B,
+                         MORSE_desc_t *TS, MORSE_desc_t *TT, MORSE_desc_t *D,
                          MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
     MORSE_option_t options;
     size_t ws_worker = 0;
     size_t ws_host = 0;
-    MORSE_desc_t *D = NULL;
 
     int k, m, n, i, p;
     int ldbm, ldak, ldbp;
@@ -88,12 +88,6 @@ void morse_pzunmlq_param(const libhqr_tree_t *qrtree,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
-    /* necessary to avoid dependencies between tasks regarding the diag tile */
-#if defined(CHAMELEON_COPY_DIAG)
-    D = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc_diag(*D, A->mb, A->nb, K*A->mb, A->nb, 0, 0, K*A->mb, A->nb, A->p, A->q);
-#endif
-
     if (side == MorseLeft ) {
         if (trans == MorseNoTrans) {
             /*
@@ -440,11 +434,5 @@ void morse_pzunmlq_param(const libhqr_tree_t *qrtree,
     RUNTIME_options_ws_free(&options);
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
-
-#if defined(CHAMELEON_COPY_DIAG)
-    MORSE_Sequence_Wait(sequence);
-    morse_desc_mat_free(D);
-    free(D);
-#endif
     (void)D;
 }
diff --git a/compute/zgelqf_param.c b/compute/zgelqf_param.c
index 4b893047a5cf81770ae47be7d7ecb452f140d10e..5d07b6eed45be7c9d34abca44597d0b04e083a15 100644
--- a/compute/zgelqf_param.c
+++ b/compute/zgelqf_param.c
@@ -238,6 +238,7 @@ int MORSE_zgelqf_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
                                   MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -281,7 +282,15 @@ int MORSE_zgelqf_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
     if (chameleon_min(M, N) == 0)
         return MORSE_SUCCESS;
 */
-    morse_pzgelqf_param(qrtree, A, TS, TT, sequence, request);
-
+#if defined(CHAMELEON_COPY_DIAG)
+    morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
+    morse_pzgelqf_param(qrtree, A, TS, TT, &D, sequence, request);
+    /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+    MORSE_Sequence_Wait(sequence);
+    morse_desc_mat_free(&D);
+#else
+    morse_pzgelqf_param(qrtree, A, TS, TT, NULL, sequence, request);
+#endif
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgelqs_param.c b/compute/zgelqs_param.c
index 71550a71bf9c987c1fdea6174c5a1b1a600455ad..581d741390d84f63abb2683be65456a2bf5baf99 100644
--- a/compute/zgelqs_param.c
+++ b/compute/zgelqs_param.c
@@ -270,6 +270,7 @@ int MORSE_zgelqs_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
     MORSE_desc_t *subB;
     MORSE_desc_t *subA;
     MORSE_context_t *morse;
+    MORSE_desc_t D;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -324,12 +325,20 @@ int MORSE_zgelqs_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
 
     subB = morse_desc_submatrix(B, 0, 0, A->m, B->n);
     subA = morse_desc_submatrix(A, 0, 0, A->m, A->m);
     morse_pztrsm(MorseLeft, MorseLower, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
     free(subA);
     free(subB);
 
-    morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, sequence, request);
-
+#if defined(CHAMELEON_COPY_DIAG)
+    morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
+    morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, &D, sequence, request);
+    /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+    MORSE_Sequence_Wait(sequence);
+    morse_desc_mat_free(&D);
+#else
+    morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, NULL, sequence, request);
+#endif
 
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zgels_param.c b/compute/zgels_param.c
index de1a1b80f7da8aeda0d20d819d00df96a2e3c3ee..92f63d817ba370243ed73e57a067572c9f4b705e 100644
--- a/compute/zgels_param.c
+++ b/compute/zgels_param.c
@@ -393,37 +393,49 @@ int MORSE_zgels_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_enum trans,
     if (A->m >= A->n) {
 
 #if defined(CHAMELEON_COPY_DIAG)
-    morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
-    morse_pzgeqrf_param(qrtree, A, TS, TT, &D, sequence, request);
-    morse_pzunmqr_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, &D, sequence, request);
-    morse_desc_mat_free(&D);
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
+        morse_pzgeqrf_param(qrtree, A, TS, TT, &D, sequence, request);
+        morse_pzunmqr_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, &D, sequence, request);
+        /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(&D);
 #else
-    morse_pzgeqrf_param(qrtree, A, TS, TT, NULL, sequence, request);
-    morse_pzunmqr_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, NULL, sequence, request);
+        morse_pzgeqrf_param(qrtree, A, TS, TT, NULL, sequence, request);
+        morse_pzunmqr_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, NULL, sequence, request);
 #endif
 
-    subB = morse_desc_submatrix(B, 0, 0, A->n, B->n);
-    subA = morse_desc_submatrix(A, 0, 0, A->n, A->n);
-    morse_pztrsm(MorseLeft, MorseUpper, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
-    free(subA);
-    free(subB);
+        subB = morse_desc_submatrix(B, 0, 0, A->n, B->n);
+        subA = morse_desc_submatrix(A, 0, 0, A->n, A->n);
+        morse_pztrsm(MorseLeft, MorseUpper, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
+        free(subA);
+        free(subB);
     }
     else {
-        /* subB = morse_desc_submatrix(B, A->m, 0, A->n-A->m, B->n);
-        morse_pztile_zero(subB, sequence, request);
-        free(subB); */
-
-        morse_pzgelqf_param(qrtree, A, TS, TT, sequence, request);
 
+#if defined(CHAMELEON_COPY_DIAG)
+        morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
+        morse_pzgelqf_param(qrtree, A, TS, TT, &D, sequence, request);
         subB = morse_desc_submatrix(B, 0, 0, A->m, B->n);
         subA = morse_desc_submatrix(A, 0, 0, A->m, A->m);
         morse_pztrsm(MorseLeft, MorseLower, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
         free(subA);
         free(subB);
+        morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, &D, sequence, request);
+        /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+        MORSE_Sequence_Wait(sequence);
+        morse_desc_mat_free(&D);
+#else
+        morse_pzgelqf_param(qrtree, A, TS, TT, NULL, sequence, request);
+        subB = morse_desc_submatrix(B, 0, 0, A->m, B->n);
+        subA = morse_desc_submatrix(A, 0, 0, A->m, A->m);
+        morse_pztrsm(MorseLeft, MorseLower, MorseNoTrans, MorseNonUnit, 1.0, subA, subB, sequence, request);
+        free(subA);
+        free(subB);
+        morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, NULL, sequence, request);
+#endif
 
-        morse_pzunmlq_param(qrtree, MorseLeft, MorseConjTrans, A, B, TS, TT, sequence, request);
-        //morse_pzunmlq(MorseLeft, MorseConjTrans, A, B, TS, sequence, request);
     }
+
     (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zunglq_param.c b/compute/zunglq_param.c
index 24294d56b6dac3ff37d39c0a87e7a280df5de39c..c0e13b6df33849e7fe2f91c5bee1bd8b2adc7622 100644
--- a/compute/zunglq_param.c
+++ b/compute/zunglq_param.c
@@ -254,6 +254,7 @@ int MORSE_zunglq_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
                                   MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -302,7 +303,17 @@ int MORSE_zunglq_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_desc_t *A,
     if (chameleon_min(M, N) == 0)
         return MORSE_SUCCESS;
 */
+#if defined(CHAMELEON_COPY_DIAG)
+    morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
     morse_pzlaset(MorseUpperLower, 0., 1., Q, sequence, request);
-    morse_pzunglq_param(qrtree, A, Q, TS, TT, sequence, request);
+    morse_pzunglq_param(qrtree, A, Q, TS, TT, &D, sequence, request);
+    /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+    MORSE_Sequence_Wait(sequence);
+    morse_desc_mat_free(&D);
+#else
+    morse_pzlaset(MorseUpperLower, 0., 1., Q, sequence, request);
+    morse_pzunglq_param(qrtree, A, Q, TS, TT, NULL, sequence, request);
+#endif
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/compute/zunmlq_param.c b/compute/zunmlq_param.c
index d009618274fbabe31594063f9a9cf9f23ec78c99..4aac83e45b3d83f8c67bbea2b8a70dc5834ba1a3 100644
--- a/compute/zunmlq_param.c
+++ b/compute/zunmlq_param.c
@@ -310,6 +310,7 @@ int MORSE_zunmlq_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_enum side,
                                   MORSE_sequence_t *sequence, MORSE_request_t *request)
 {
     MORSE_context_t *morse;
+    MORSE_desc_t D;
 
     morse = morse_context_self();
     if (morse == NULL) {
@@ -364,7 +365,16 @@ int MORSE_zunmlq_param_Tile_Async(const libhqr_tree_t *qrtree, MORSE_enum side,
     if (chameleon_min(M, chameleon_min(N, K)) == 0)
         return MORSE_SUCCESS;
 */
-    morse_pzunmlq_param(qrtree, side, trans, A, C, TS, TT, sequence, request);
 
+#if defined(CHAMELEON_COPY_DIAG)
+    morse_zdesc_alloc(D, A->mb, A->nb, A->m, chameleon_min(A->m, A->n), 0, 0, A->m, chameleon_min(A->m, A->n), );
+    morse_pzunmlq_param(qrtree, side, trans, A, C, TS, TT, &D, sequence, request);
+    /* Submitted tasks may still reference D: wait for the sequence before releasing it */
+    MORSE_Sequence_Wait(sequence);
+    morse_desc_mat_free(&D);
+#else
+    morse_pzunmlq_param(qrtree, side, trans, A, C, TS, TT, NULL, sequence, request);
+#endif
+    (void)D;
     return MORSE_SUCCESS;
 }
diff --git a/testing/testing_zgels_systolic.c b/testing/testing_zgels_systolic.c
index ef9db84d505a942eaa33acd1a92b2b0582402db6..fce16bfc8fa637f8616d1da1b2475345567ca569 100644
--- a/testing/testing_zgels_systolic.c
+++ b/testing/testing_zgels_systolic.c
@@ -218,10 +218,10 @@ int testing_zgels_systolic(int argc, char **argv)
 
         /* Morse routines */
         MORSE_zgelqf_param(&qrtree, M, N, A2, LDA, TS, TT);
-        MORSE_zunglq(M, N, K, A2, LDA, TS, Q, LDA);
-        // MORSE_zunglq_param(&qrtree, M, N, K, A2, LDA, TS, TT, Q, LDA);
-        //MORSE_zgelqs_param(&qrtree, M, N, NRHS, A2, LDA, TS, TT, B2, LDB);
-        MORSE_zgelqs(M, N, NRHS, A2, LDA, TS, B2, LDB);
+        //MORSE_zunglq(M, N, K, A2, LDA, TS, Q, LDA);
+        MORSE_zunglq_param(&qrtree, M, N, K, A2, LDA, TS, TT, Q, LDA);
+        MORSE_zgelqs_param(&qrtree, M, N, NRHS, A2, LDA, TS, TT, B2, LDB);
+        //MORSE_zgelqs(M, N, NRHS, A2, LDA, TS, B2, LDB);
 
         /* Check the orthogonality, factorization and the solution */
         info_ortho = check_orthogonality(M, N, LDA, Q, eps);
@@ -284,10 +284,10 @@ int testing_zgels_systolic(int argc, char **argv)
 
         MORSE_zgelqf_param(&qrtree, M, N, A2, LDA, TS, TT);
         MORSE_ztrsm(MorseLeft, MorseLower, MorseNoTrans, MorseNonUnit, M, NRHS, 1.0, A2, LDA, B2, LDB);
-        //MORSE_zunglq_param(&qrtree, M, N, K, A2, LDA, TS, TT, Q, LDA);
-        MORSE_zunglq(M, N, K, A2, LDA, TS, Q, LDA);
-        //MORSE_zunmlq_param(&qrtree, MorseLeft, MorseConjTrans, N, NRHS, M, A2, LDA, TS, TT, B2, LDB);
-        MORSE_zunmlq(MorseLeft, MorseConjTrans, N, NRHS, M, A2, LDA, TS, B2, LDB);
+        MORSE_zunglq_param(&qrtree, M, N, K, A2, LDA, TS, TT, Q, LDA);
+        //MORSE_zunglq(M, N, K, A2, LDA, TS, Q, LDA);
+        MORSE_zunmlq_param(&qrtree, MorseLeft, MorseConjTrans, N, NRHS, M, A2, LDA, TS, TT, B2, LDB);
+        //MORSE_zunmlq(MorseLeft, MorseConjTrans, N, NRHS, M, A2, LDA, TS, B2, LDB);
     }
 
     /* Check the orthogonality, factorization and the solution */