diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index 98312113fb288cd692b363919bc1410210bcdea3..2f66d1abb861d6619795e35e86704b2f68c04c7b 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -32,7 +32,11 @@
 
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel tile LQ factorization - dynamic scheduling
@@ -112,11 +116,13 @@ void morse_pzgelqf(MORSE_desc_t *A, MORSE_desc_t *T,
             A(k, k), ldak,
             T(k, k), T->mb);
         if ( k < (A->mt-1) ) {
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
-                DIAG(k), A->mb );
+                DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index 04d389694c0ed92c898150452656d296b893b4b7..987105ba739e44d6fc54d2cdd1659c239e80b1cf 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -36,7 +36,11 @@
 #define A(m,n) A,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
+#else
+#define DIAG(m,n) A,  (m),  (n)
+#endif
 
 /***************************************************************************//**
  *  Parallel tile LQ factorization (reduction Householder) - dynamic scheduling
@@ -117,11 +121,13 @@ void morse_pzgelqfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                 A(k, N), ldak,
                 T(k, N), T->mb);
         if ( k < (A->mt-1) ) {
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, tempkm, tempNn, A->nb,
                 A(k, N), ldak,
                 DIAG(k, N), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 88f274636efbf2869e24fc07b52f83ed4652d583..4928d45770a24852bb0c290f26b7f0defe74ffc4 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -32,7 +32,11 @@
 
 #define A(m,n) A,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization - dynamic scheduling
@@ -107,11 +111,13 @@ void morse_pzgeqrf(MORSE_desc_t *A, MORSE_desc_t *T,
             A(k, k), ldak,
             T(k, k), T->mb);
         if ( k < (A->nt-1) ) {
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseLower, A->mb, A->nb, A->nb,
                 A(k, k), ldak,
                 DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index bbcb6414c636851b6c4ab3ede20bcedbb53fc4cd..143b66109a03ec230169ba5086080661b6401bc8 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -34,7 +34,11 @@
 #define A(m,n) A,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m), ((n)+A->nt)
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
+#else
+#define DIAG(m,n) A,  (m),  (n)
+#endif
 
 /***************************************************************************//**
  *  Parallel tile QR factorization (reduction Householder) - dynamic scheduling
@@ -116,11 +120,13 @@ void morse_pzgeqrfrh(MORSE_desc_t *A, MORSE_desc_t *T, int BS,
                 A(M, k), ldaM,
                 T(M, k), T->mb);
             if ( k < (A->nt-1) ) {
-                MORSE_TASK_zlacpy(
-                    &options,
-                    MorseLower, tempMm, A->nb, A->nb,
-                    A(M, k), ldaM,
-                    DIAG(M, k), ldaM );
+#if defined(CHAMELEON_COPY_DIAG)
+            MORSE_TASK_zlacpy(
+                &options,
+                MorseLower, tempMm, A->nb, A->nb,
+                A(M, k), ldaM,
+                DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c
index 08b90104ab9d97356d68c654979a2853aa10d4c2..6931064ac80f77380b881991b37c4e247e56d991 100644
--- a/compute/pzgetrf_incpiv.c
+++ b/compute/pzgetrf_incpiv.c
@@ -33,7 +33,11 @@
 #include "common.h"
 
 #define A(_m_,_n_) A, _m_, _n_
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(_k_) DIAG, _k_, 0
+#else
+#define DIAG(_k_) A, _k_, _k_
+#endif
 #define L(_m_,_n_) L,  _m_,  _n_
 #define IPIV(_m_,_n_) &(IPIV[(int64_t)A->mb*((int64_t)(_m_)+(int64_t)A->mt*(int64_t)(_n_))])
 
@@ -86,11 +90,13 @@ void morse_pzgetrf_incpiv(MORSE_desc_t *A, MORSE_desc_t *L, int *IPIV,
             k == A->mt-1, A->nb*k);
 
         if ( k < (minMNT-1) ) {
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpperLower, tempkm, tempkn, A->nb,
                 A(k, k), ldak,
                 DIAG(k), ldak);
+#endif
         }
 
         for (n = k+1; n < A->nt; n++) {
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index 8372ddaf02d525ae302582c09b025227c7b9e096..1d2c7ba0a7bfff03503b975038183b1ca6f3849e 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -33,7 +33,11 @@
 #define A(m,n) A,  m,  n
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
@@ -112,11 +116,13 @@ void morse_pzunglq(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                     T(k, n), T->mb);
             }
         }
+#if defined(CHAMELEON_COPY_DIAG)
         MORSE_TASK_zlacpy(
             &options,
             MorseUpper, tempkmin, tempkn, A->nb,
             A(k, k), ldak,
             DIAG(k), A->mb );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlaset(
             &options,
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index 1991b531f6d2d34f5de629c5cc20247ec22d0b85..96f98f3103a3a74c7cba18c37385c18fda6f9240 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -33,7 +33,11 @@
 #define Q(m,n) Q,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
+#else
+#define DIAG(m,n) A, (m), (n)
+#endif
 
 /**
  *  Parallel construction of Q using tile V (application to identity;
@@ -135,11 +139,13 @@ void morse_pzunglqrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                         T(k, n), T->mb);
                 }
             }
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseUpper, tempkmin, tempNn, A->nb,
                 A(k, N), ldak,
                 DIAG(k, N), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index a67a27696bc6a44b8ef0c9241cdeaab0fe1b3c90..92b429f42eb823bc0b8da32ec382228416b7943f 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -33,7 +33,11 @@
 #define A(m,n) A,  m,  n
 #define Q(m,n) Q,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel construction of Q using tile V (application to identity) - dynamic scheduling
@@ -108,11 +112,13 @@ void morse_pzungqr(MORSE_desc_t *A, MORSE_desc_t *Q, MORSE_desc_t *T,
                     T(m, k), T->mb);
             }
         }
+#if defined(CHAMELEON_COPY_DIAG)
         MORSE_TASK_zlacpy(
             &options,
             MorseLower, tempkm, tempkmin, A->nb,
             A(k, k), ldak,
             DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
         MORSE_TASK_zlaset(
             &options,
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index c3fc863a3f8115cda1a8d48f2f5b35efc93efac2..fa78dea0375627932d1b25a77f6bf5b81231458f 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -35,7 +35,11 @@
 #define Q(m,n) Q,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+(A->nt)
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
+#else
+#define DIAG(m,n) A, (m), (n)
+#endif
 
 /**
  *  Parallel construction of Q using tile V (application to identity;
@@ -141,11 +145,13 @@ void morse_pzungqrrh(MORSE_desc_t *A, MORSE_desc_t *Q,
                         T(m, k), T->mb);
                 }
             }
+#if defined(CHAMELEON_COPY_DIAG)
             MORSE_TASK_zlacpy(
                 &options,
                 MorseLower, tempMm, tempkmin, A->nb,
                 A(M, k), ldaM,
                 DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
             MORSE_TASK_zlaset(
                 &options,
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index 4592ab327288c40ab0cab0370b79a5cc573f55d5..244d6cd4c831aa1de9ce944806a932009ce7bf80 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -34,7 +34,11 @@
 #define A(m,n) A,  m,  n
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - LQ factorization - dynamic scheduling
@@ -103,11 +107,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -166,11 +172,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                             T(k, m), T->mb);
                     }
                 }
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkm, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -215,11 +223,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                             T(k, n), T->mb);
                     }
                 }
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -248,11 +258,13 @@ void morse_pzunmlq(MORSE_enum side, MORSE_enum trans,
                 tempkn   = k == B->nt -1 ? B->n -k*B->nb : B->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->mb : A->mb;
                 ldak = BLKLDD(A, k);
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseUpper, tempkmin, tempkn, A->nb,
                     A(k, k), ldak,
                     DIAG(k), A->mb );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index e2ad8eb63cf65b601f6235313d063142ba84ccc3..a17259921f8371a485c11ae9c5ab41a29e219c10 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -35,7 +35,11 @@
 #define B(m,n) B,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  (n)+A->nt
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((n)/BS), 0
+#else
+#define DIAG(m,n) A, (m), (n)
+#endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - LQ factorization (reduction
@@ -106,11 +110,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                     tempkmin = min(tempkm,tempNn);
                     ldaN = BLKLDD(A, N);
                     ldbN = BLKLDD(B, N);
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -216,11 +222,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                                 T(k, m), T->mb);
                         }
                     }
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -291,11 +299,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                                 T(k, n), T->mb);
                         }
                     }
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldak,
                         DIAG(k, N), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -328,11 +338,13 @@ void morse_pzunmlqrh(MORSE_enum side, MORSE_enum trans,
                     tempNn = N == A->nt-1 ? A->n-N*A->nb : A->nb;
                     tempkmin = min(tempkm,tempNn);
                     ldaN = BLKLDD(A, N);
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseUpper, tempkmin, tempNn, A->nb,
                         A(k, N), ldaN,
                         DIAG(k, N), ldaN );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index d7c426415a2edb3fd542de5ba4c31177504a92b3..b71fef55cc527341325b84f47ed76b05ded32016 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -34,7 +34,11 @@
 #define A(m,n) A,  m,  n
 #define B(m,n) B,  m,  n
 #define T(m,n) T,  m,  n
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(k) DIAG, k, 0
+#else
+#define DIAG(k) A, k, k
+#endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - QR factorization - dynamic scheduling
@@ -103,11 +107,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
                 ldbk = BLKLDD(B, k);
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -168,11 +174,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                             T(m, k), T->mb);
                     }
                 }
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkm, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -219,11 +227,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                             T(n, k), T->mb);
                     }
                 }
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
@@ -252,11 +262,13 @@ void morse_pzunmqr(MORSE_enum side, MORSE_enum trans,
                 tempkn   = k == B->nt-1 ? B->n-k*B->nb : B->nb;
                 tempkmin = k == minMT-1 ? minM-k*A->nb : A->nb;
                 ldak = BLKLDD(A, k);
+#if defined(CHAMELEON_COPY_DIAG)
                 MORSE_TASK_zlacpy(
                     &options,
                     MorseLower, tempkn, tempkmin, A->nb,
                     A(k, k), ldak,
                     DIAG(k), ldak );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                 MORSE_TASK_zlaset(
                     &options,
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index fbfb4496c4d7744164de0c36fd4c3337f7ec5385..5b1dbc3afc6085eb49312decb8056a350663197f 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -35,7 +35,11 @@
 #define B(m,n) B,  (m),  (n)
 #define T(m,n) T,  (m),  (n)
 #define T2(m,n) T,  (m),  ((n)+A->nt)
+#if defined(CHAMELEON_COPY_DIAG)
 #define DIAG(m,n) DIAG, ((m)/BS), 0
+#else
+#define DIAG(m,n) A, (m), (n)
+#endif
 
 /***************************************************************************//**
  *  Parallel application of Q using tile V - QR factorization (reduction
@@ -105,11 +109,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                     tempkmin = min(tempMm, tempkn);
                     ldaM = BLKLDD(A, M);
                     ldbM = BLKLDD(B, M);
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -218,11 +224,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                                 T(m, k), T->mb);
                         }
                     }
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -295,11 +303,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                                 T(n, k), T->mb);
                         }
                     }
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
@@ -331,11 +341,13 @@ void morse_pzunmqrrh(MORSE_enum side, MORSE_enum trans,
                     tempMm   = M == A->mt-1 ? A->m-M*A->mb : A->mb;
                     tempkmin = min(tempMm, tempkn);
                     ldaM = BLKLDD(A, M);
+#if defined(CHAMELEON_COPY_DIAG)
                     MORSE_TASK_zlacpy(
                         &options,
                         MorseLower, tempMm, tempkmin, A->nb,
                         A(M, k), ldaM,
                         DIAG(M, k), ldaM );
+#endif
 #if defined(CHAMELEON_USE_MAGMA)
                     MORSE_TASK_zlaset(
                         &options,
diff --git a/control/common.h b/control/common.h
index 8be1ac62403a74600dc7838e4527ad46b4a22808..193a3dd3950171c39f84ff791ea4763fc1640429 100644
--- a/control/common.h
+++ b/control/common.h
@@ -136,6 +136,12 @@
 #define MORSE_MPI_SIZE    morse->mpi_comm_size
 #endif
 
+/*******************************************************************************
+ *  Enable copying the diagonal tile into a separate DIAG workspace (StarPU only) in some parallel tile algorithms (pz* routines)
+ **/
+#if defined(CHAMELEON_SCHED_STARPU)
+#define CHAMELEON_COPY_DIAG
+#endif
 
 /*******************************************************************************
  *  IPT internal define