diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index a2dbeb4857b3c5b39d41555b35c8edf471c70db4..98312113fb288cd692b363919bc1410210bcdea3 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -32,11 +32,8 @@
 
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel tile LQ factorization - dynamic scheduling
  **/
@@ -77,9 +74,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    /* necessary to use UNMLQ on GPU */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb);
     /* Worker space
      *
      * zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) )
@@ -104,6 +98,10 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between the tslqt and unmlq tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, (minMT-1)*A->mb, A->nb, 0, 0, (minMT-1)*A->mb, A->nb);
+
     for (k = 0; k < min(A->mt, A->nt); k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -113,20 +111,20 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
             tempkm, tempkn, ib, T->nb,
             A(k, k), ldak,
             T(k, k), T->mb);
-#if defined(CHAMELEON_USE_MAGMA)
         if ( k < (A->mt-1) ) {
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
                 DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseLower, A->mb, A->nb,
                 0., 1.,
                 DIAG(k), A->mb );
-        }
 #endif
+        }
         for (m = k+1; m < A->mt; m++) {
             tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
             ldam = BLKLDD(A, m);
@@ -164,8 +162,6 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index 7304c96671df425597e76b5f1d4bdb85a5fdfa1a..04d389694c0ed92c898150452656d296b893b4b7 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -36,11 +36,8 @@
 #define A(m,n) A,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
+
 /***************************************************************************//**
  *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
  **/
@@ -58,6 +55,7 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     int ldak, ldam;
     int tempkmin, tempkm, tempNn, tempnn, tempmm, tempNRDn;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -78,12 +76,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        /* necessary to use UNMLQ on GPU */
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
     /* Worker space
      *
      * zgelqt = max( A->nb * (ib+1), ib * (ib + A->nb) )
@@ -108,6 +100,11 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    nblk = ( A->nt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     for (k = 0; k < min(A->mt, A->nt); k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         ldak = BLKLDD(A, k);
@@ -119,20 +116,20 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                 tempkm, tempNn, ib, T->nb,
                 A(k, N), ldak,
                 T(k, N), T->mb);
-#if defined(CHAMELEON_USE_MAGMA)
         if ( k < (A->mt-1) ) {
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, tempkm, tempNn, A->nb,
                 A(k, N), ldak,
                 DIAG(k, N), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseLower, tempkm, tempNn,
                 0., 1.,
                 DIAG(k, N), ldak );
-        }
 #endif
+        }
             for (m = k+1; m < A->mt; m++) {
                 tempmm = m == A->mt-1 ? A->m-m*A->mb : A->mb;
                 ldam = BLKLDD(A, m);
@@ -196,8 +193,6 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 048376d8683ebc13f0b2bd02ef42abc4349a34f9..88f274636efbf2869e24fc07b52f83ed4652d583 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -32,11 +32,8 @@
 
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel tile QR factorization - dynamic scheduling
  **/
@@ -72,9 +69,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    /* necessary to use UNMQR on GPU */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
     /* Worker space
      *
      * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
@@ -99,6 +93,10 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between the tsqrt and unmqr tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
+
     for (k = 0; k < minMNT; k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -108,20 +106,20 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
             tempkm, tempkn, ib, T->nb,
             A(k, k), ldak,
             T(k, k), T->mb);
-#if defined(CHAMELEON_USE_MAGMA)
         if ( k < (A->nt-1) ) {
             MORSE_TASK_zlacpy(
                 &options,
                 MorseLower, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
                 DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseUpper, A->mb, A->nb,
                 0., 1.,
                 DIAG(k), ldak );
-        }
 #endif
+        }
         for (n = k+1; n < A->nt; n++) {
             tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
             MORSE_TASK_zunmqr(
@@ -158,8 +156,6 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index 56ce1357a221e678f185f78f7e29a9779a4a927d..bbcb6414c636851b6c4ab3ede20bcedbb53fc4cd 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -34,11 +34,7 @@
 #define A(m,n) A,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m), ((n)+A->nt)
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
@@ -57,6 +53,7 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     int ldaM, ldam, ldaMRD;
     int tempkmin, tempkn, tempMm, tempnn, tempmm, tempMRDm;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -77,12 +74,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        int nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
-
     /* Worker space
      *
      * zgeqrt = max( A->nb * (ib+1), ib * (ib + A->nb) )
@@ -107,6 +98,11 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    nblk = ( A->mt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     K = min(A->mt, A->nt);
     for (k = 0; k < K; k++) {
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -119,20 +115,20 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                 tempMm, tempkn, ib, T->nb,
                 A(M, k), ldaM,
                 T(M, k), T->mb);
-#if defined(CHAMELEON_USE_MAGMA)
             if ( k < (A->nt-1) ) {
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempMm, A->nb, A->nb,
                     A(M, k), ldaM,
                     DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempMm, A->nb,
                     0., 1.,
                     DIAG(M, k), ldaM );
-            }
 #endif
+            }
             for (n = k+1; n < A->nt; n++) {
                 tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
                 MORSE_TASK_zunmqr(
@@ -196,8 +192,6 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c
index 813a59f2ea18d1f561043266a9a9b1d5df9f562a..08b90104ab9d97356d68c654979a2853aa10d4c2 100644
--- a/compute/pzgetrf_incpiv.c
+++ b/compute/pzgetrf_incpiv.c
@@ -33,11 +33,7 @@
 #include "common.h"
 
 #define A(_m_,_n_) A, _m_, _n_
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(_k_) DIAG, _k_, 0
-#else
-#define DIAG(_k_) A, _k_, _k_
-#endif
 #define L(_m_,_n_) L,  _m_,  _n_
 #define IPIV(_m_,_n_) &(IPIV[(int64_t)A->mb*((int64_t)(_m_)+(int64_t)A->mt*(int64_t)(_n_))])
 
@@ -65,9 +61,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
 
     ib = MORSE_IB;
 #if defined(CHAMELEON_USE_MAGMA)
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
-
     h_work_size  = sizeof(MORSE_Complex64_t)*( 2*ib + 2*L->nb )*2*A->mb;
     d_work_size  = sizeof(MORSE_Complex64_t)*(   ib           )*2*A->mb;
 #else
@@ -76,6 +69,10 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
 #endif
     RUNTIME_options_ws_alloc( &options, h_work_size, d_work_size );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
+
     for (k = 0; k < minMNT; k++) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -88,7 +85,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
             IPIV(k, k),
             k == A->mt-1, A->nb*k);
 
-#if defined(CHAMELEON_USE_MAGMA)
         if ( k < (minMNT-1) ) {
             MORSE_TASK_zlacpy(
                 &options,
@@ -96,7 +92,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
                 A(k, k), ldak,
                 DIAG(k), ldak);
         }
-#endif
 
         for (n = k+1; n < A->nt; n++) {
             tempnn = n == A->nt-1 ? A->n-n*A->nb : A->nb;
@@ -137,8 +132,6 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index bdaf41676b82c43bf8c9348307422982a8bf2f5f..8372ddaf02d525ae302582c09b025227c7b9e096 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -33,11 +33,8 @@
 #define A(m,n) A,  m,  n
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
  **/
@@ -77,8 +74,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
     /* Worker space
      *
      * zunmlq = A->nb * ib
@@ -92,6 +87,10 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
+
     for (k = min(A->mt, A->nt)-1; k >= 0; k--) {
         tempAkm  = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempAkn  = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -113,12 +112,12 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                     T(k, n), T->mb);
             }
         }
-#if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlacpy(
             &options,
             MorseUpper, tempkmin, tempkn, A->nb,
             A(k, k), ldak,
             DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlaset(
             &options,
             MorseLower, tempkmin, tempkn,
@@ -141,8 +140,6 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index f158c828561e775012f7f0e6b348514149a0f248..1991b531f6d2d34f5de629c5cc20247ec22d0b85 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -33,11 +33,8 @@
 #define Q(m,n) Q,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
+
 /**
  *  Parallel construction of Q using tile V (application to identity;
  *  reduction Householder) - dynamic scheduling
@@ -58,6 +55,7 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     int ldqm;
     int tempkm, tempkmin, tempNn, tempnn, tempmm, tempNRDn;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -74,12 +72,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        /* necessary to use UNMLQ on GPU */
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
     /* Worker space
      *
      * zunmqr = A->nb * ib
@@ -93,6 +85,11 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    nblk = ( A->nt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     K = min(A->mt, A->nt);
     for (k = K-1; k >= 0; k--) {
         tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
@@ -138,12 +135,12 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                         T(k, n), T->mb);
                 }
             }
-#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, tempkmin, tempNn, A->nb,
                 A(k, N), ldak,
                 DIAG(k, N), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseLower, tempkmin, tempNn,
@@ -168,8 +165,6 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index 1e287e9fe778da3f7e03238a5b5054706413da3b..a67a27696bc6a44b8ef0c9241cdeaab0fe1b3c90 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -33,11 +33,8 @@
 #define A(m,n) A,  m,  n
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
  **/
@@ -71,8 +68,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     /* Allocation of temporary (scratch) working space */
 #if defined(CHAMELEON_USE_MAGMA)
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
     /* Worker space
      *
      * zunmqr = A->nb * ib
@@ -86,6 +81,10 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
+
     for (k = min(A->mt, A->nt)-1; k >= 0; k--) {
         tempAkm  = k == A->mt-1 ? A->m-k*A->mb : A->mb;
         tempAkn  = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -109,12 +108,12 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                     T(m, k), T->mb);
             }
         }
-#if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlacpy(
             &options,
             MorseLower, tempkm, tempkmin, A->nb,
             A(k, k), ldak,
             DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlaset(
             &options,
             MorseUpper, tempkm, tempkmin,
@@ -136,8 +135,6 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index 036fe35aec813686807afd7b90ae8bd2fac9ae98..c3fc863a3f8115cda1a8d48f2f5b35efc93efac2 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -35,11 +35,7 @@
 #define Q(m,n) Q,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
 
 /**
  *  Parallel construction of Q using tile V (application to identity;
@@ -61,6 +57,7 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     int ldbM, ldbm, ldbMRD;
     int tempkn, tempMm, tempnn, tempmm, tempMRDm, tempkmin;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -77,12 +74,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        int nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
-
     /* Worker space
      *
      * zunmqr = A->nb * ib
@@ -96,6 +87,11 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    nblk = ( A->mt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     K = min(A->mt, A->nt);
     for (k = K-1; k >= 0; k--) {
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
@@ -145,12 +141,12 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                         T(m, k), T->mb);
                 }
             }
-#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseLower, tempMm, tempkmin, A->nb,
                 A(M, k), ldaM,
                 DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
                 MorseUpper, tempMm, tempkmin,
@@ -174,8 +170,6 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index 6a96c0a5def422ad5ce2a82bfb9e0b4c5c29dcbd..4592ab327288c40ab0cab0370b79a5cc573f55d5 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -34,11 +34,8 @@
 #define A(m,n) A,  m,  n
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
  **/
@@ -79,9 +76,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    /* necessary to use UNMLQ on GPU */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
     /* Worker space
      *
      * zunmlq = A->nb * ib
@@ -95,6 +89,10 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, minMT*A->mb, A->nb, 0, 0, minMT*A->mb, A->nb);
+
     if (side == MorseLeft ) {
         if (trans == MorseNoTrans) {
             /*
@@ -105,12 +103,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkm,
@@ -168,12 +166,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                             T(k, m), T->mb);
                     }
                 }
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkm,
@@ -217,12 +215,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                             T(k, n), T->mb);
                     }
                 }
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkn,
@@ -250,12 +248,12 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                 tempkn   = k == B->nt -1 ? B->n -k*B->nb : B->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb;
                 ldak = BLKLDD(A, k);
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseLower, tempkmin, tempkn,
@@ -295,8 +293,6 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 277b5d11772ad6326d004ed5ebb0aceda71a2059..e2ad8eb63cf65b601f6235313d063142ba84ccc3 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -35,11 +35,8 @@
 #define B(m,n) B,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
+
 /***************************************************************************//**
  *  Parallel application of Q using tile V - LQ factorization (reduction
  *  Householder) - dynamic scheduling
@@ -60,6 +57,7 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
     int ldbN, ldbm, ldbNRD;
     int tempNn, tempkm, tempnn, tempmm, tempNRDn, tempkmin;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -76,12 +74,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        /* necessary to use UNMLQ on GPU */
-        int nblk = ( A->nt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
     /* Worker space
      *
      * zunmlq = A->nb * ib
@@ -95,6 +87,11 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG is a separate copy of the diag tiles, necessary to avoid dependencies between tasks on those tiles */
+    nblk = ( A->nt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     K = min(A->mt, A->nt);
     if (side == MorseLeft ) {
         if (trans == MorseNoTrans) {
@@ -109,12 +106,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                     tempkmin = min(tempkm,tempNn);
                     ldaN = BLKLDD(A, N);
                     ldbN = BLKLDD(B, N);
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -219,12 +216,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                                 T(k, m), T->mb);
                         }
                     }
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -294,12 +291,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                                 T(k, n), T->mb);
                         }
                     }
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -331,12 +328,12 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                     tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
                     tempkmin = min(tempkm,tempNn);
                     ldaN = BLKLDD(A, N);
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldaN,
                         DIAG(k, N), ldaN );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseLower, tempkmin, tempNn,
@@ -397,8 +394,6 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index 3605e86ee76316410a71ecc883a89feaa87f55d9..d7c426415a2edb3fd542de5ba4c31177504a92b3 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -34,11 +34,8 @@
 #define A(m,n) A,  m,  n
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(k) DIAG, k, 0
-#else
-#define DIAG(k) A, k, k
-#endif
+
 /***************************************************************************//**
  *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
  **/
@@ -71,9 +68,6 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    /* necessary to use UNMQR on GPU */
-    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
     /* Worker space
      *
      * zunmqr = A->nb * ib
@@ -87,6 +81,10 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG receives copies of the A(k, k) tiles; necessary to avoid dependencies between tasks regarding the diag tile */
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, min(A->m, A->n), A->nb, 0, 0, min(A->m, A->n), A->nb);
+
     if (A->m > A->n) {
         minM  = A->n;
         minMT = A->nt;
@@ -105,12 +103,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkm, tempkmin,
@@ -170,12 +168,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                             T(m, k), T->mb);
                     }
                 }
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkm, tempkmin,
@@ -221,12 +219,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                             T(n, k), T->mb);
                     }
                 }
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkn, tempkmin,
@@ -254,12 +252,12 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                 tempkn   = k == B->nt-1 ? B->n-k*B->nb : B->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
-#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
                     MorseUpper, tempkn, tempkmin,
@@ -301,8 +299,6 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index 0c77fe6be0309d9f4274cc54ecb411c5bd7d92a2..fbfb4496c4d7744164de0c36fd4c3337f7ec5385 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -35,11 +35,8 @@
 #define B(m,n) B,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  ((n)+A->nt)
-#if defined(CHAMELEON_USE_MAGMA)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
-#else
-#define DIAG(m,n) A, (m), (n)
-#endif
+
 /***************************************************************************//**
  *  Parallel application of Q using tile V - QR factorization (reduction
  *  Householder) - dynamic scheduling
@@ -60,6 +57,7 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
     int ldbM, ldbm, ldbMRD;
     int tempMm, tempkn, tempnn, tempmm, tempMRDm, tempkmin;
     int ib;
+    int nblk;
 
     morse = morse_context_self();
     if (sequence->status != MORSE_SUCCESS)
@@ -76,12 +74,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
     ws_worker = A->nb * ib;
 
 #if defined(CHAMELEON_USE_MAGMA)
-    {
-        int nblk = ( A->mt + BS -1 ) / BS;
-        DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
-        morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
-    }
-
     /* Worker space
      *
      * zunmqr = A->nb * ib
@@ -95,6 +87,11 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
 
     RUNTIME_options_ws_alloc( &options, ws_worker, ws_host );
 
+    /* DIAG receives copies of the A(M, k) tiles; necessary to avoid dependencies between tasks regarding the diag tile */
+    nblk = ( A->mt + BS -1 ) / BS;
+    DIAG = (MORSE_desc_t*)malloc(sizeof(MORSE_desc_t));
+    morse_zdesc_alloc2(*DIAG, A->mb, A->nb, nblk * A->mb, A->nb, 0, 0, nblk * A->mb, A->nb);
+
     K = min(A->mt, A->nt);
     if (side == MorseLeft ) {
         if (trans == MorseConjTrans) {
@@ -108,12 +105,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                     tempkmin = min(tempMm, tempkn);
                     ldaM = BLKLDD(A, M);
                     ldbM = BLKLDD(B, M);
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
@@ -221,12 +218,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                                 T(m, k), T->mb);
                         }
                     }
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
@@ -298,12 +295,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                                 T(n, k), T->mb);
                         }
                     }
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
@@ -334,12 +331,12 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                     tempMm   = M == A->mt-1 ? A->m-M*A->mb : A->mb;
                     tempkmin = min(tempMm, tempkn);
                     ldaM = BLKLDD(A, M);
-#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
                         MorseUpper, tempMm, tempkmin,
@@ -402,8 +399,6 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
     RUNTIME_options_finalize(&options, morse);
     MORSE_TASK_dataflush_all();
 
-#if defined(CHAMELEON_USE_MAGMA)
     morse_desc_mat_free(DIAG);
     free(DIAG);
-#endif
 }