From b904e524a38e72f446fc3a6856008e46b4371f07 Mon Sep 17 00:00:00 2001
From: Alycia Lisito <alycia.lisito@inria.fr>
Date: Wed, 6 Apr 2022 16:12:12 +0200
Subject: [PATCH] core: remove nb from args of INSERT_TASK_zlacpy[x]

---
 compute/pzgelqf.c                          |  2 +-
 compute/pzgelqf_param.c                    |  2 +-
 compute/pzgelqfrh.c                        |  2 +-
 compute/pzgemm.c                           | 16 ++---
 compute/pzgenm2.c                          | 14 ++--
 compute/pzgeqrf.c                          |  2 +-
 compute/pzgeqrf_param.c                    |  2 +-
 compute/pzgeqrfrh.c                        |  2 +-
 compute/pzgetrf_incpiv.c                   |  2 +-
 compute/pzhemm.c                           | 16 ++---
 compute/pzhetrd_he2hb.c                    |  6 +-
 compute/pzlacpy.c                          | 10 +--
 compute/pzlange.c                          |  2 +-
 compute/pzlansy.c                          |  2 +-
 compute/pzsymm.c                           | 16 ++---
 compute/pzunglq.c                          |  2 +-
 compute/pzunglq_param.c                    |  2 +-
 compute/pzunglqrh.c                        |  2 +-
 compute/pzungqr.c                          |  2 +-
 compute/pzungqr_param.c                    |  2 +-
 compute/pzungqrrh.c                        |  2 +-
 compute/pzunmlq.c                          |  8 +--
 compute/pzunmlq_param.c                    |  8 +--
 compute/pzunmlqrh.c                        |  8 +--
 compute/pzunmqr.c                          |  8 +--
 compute/pzunmqr_param.c                    |  8 +--
 compute/pzunmqrrh.c                        |  8 +--
 compute/zplghe.c                           | 14 ++--
 coreblas/compute/core_ztile.c              |  2 +-
 coreblas/compute/core_ztile_empty.c        |  2 +-
 coreblas/include/coreblas/coreblas_ztile.h |  2 +-
 include/chameleon/tasks_z.h                |  8 +--
 runtime/openmp/codelets/codelet_zlacpy.c   | 39 ++++++-----
 runtime/parsec/codelets/codelet_zlacpy.c   | 81 +++++++++++++++-------
 runtime/quark/codelets/codelet_zlacpy.c    | 74 ++++++++++++--------
 runtime/starpu/codelets/codelet_zlacpy.c   | 24 +++----
 36 files changed, 224 insertions(+), 178 deletions(-)

diff --git a/compute/pzgelqf.c b/compute/pzgelqf.c
index c72ed710e..32346a6e8 100644
--- a/compute/pzgelqf.c
+++ b/compute/pzgelqf.c
@@ -104,7 +104,7 @@ void chameleon_pzgelqf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
             int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamUpper, tempDkm, tempDkn, A->nb,
+                ChamUpper, tempDkm, tempDkn,
                 A(k, k),
                 D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzgelqf_param.c b/compute/pzgelqf_param.c
index a8c45399f..b69b84b50 100644
--- a/compute/pzgelqf_param.c
+++ b/compute/pzgelqf_param.c
@@ -115,7 +115,7 @@ void chameleon_pzgelqf_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
 
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempDkm, tempDpn, A->nb,
+                    ChamUpper, tempDkm, tempDpn,
                     A(k, p), D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
                 INSERT_TASK_zlaset(
diff --git a/compute/pzgelqfrh.c b/compute/pzgelqfrh.c
index 5d7fae952..d192c978c 100644
--- a/compute/pzgelqfrh.c
+++ b/compute/pzgelqfrh.c
@@ -107,7 +107,7 @@ void chameleon_pzgelqfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
 
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempDkm, tempDNn, A->nb,
+                    ChamUpper, tempDkm, tempDNn,
                     A(k, N),
                     D(k, N) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzgemm.c b/compute/pzgemm.c
index b2a71d511..66a79bcde 100644
--- a/compute/pzgemm.c
+++ b/compute/pzgemm.c
@@ -72,7 +72,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
             if ( transA == ChamNoTrans ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    ChamUpperLower, tempmm, tempkk,
                     A(  m,  k ),
                     WA( m, (k % C->q) + lq ) );
 
@@ -81,7 +81,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                 for ( q=1; q < C->q; q++ ) {
                     INSERT_TASK_zlacpy(
                         options,
-                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        ChamUpperLower, tempmm, tempkk,
                         WA( m, ((k+q-1) % C->q) + lq ),
                         WA( m, ((k+q)   % C->q) + lq ) );
                 }
@@ -89,7 +89,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
             else {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempmm, C->mb,
+                    ChamUpperLower, tempkk, tempmm,
                     A(  k,  m ),
                     WA( m, (k % C->q) + lq ) );
 
@@ -98,7 +98,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                 for ( q=1; q < C->q; q++ ) {
                     INSERT_TASK_zlacpy(
                         options,
-                        ChamUpperLower, tempkk, tempmm, C->mb,
+                        ChamUpperLower, tempkk, tempmm,
                         WA( m, ((k+q-1) % C->q) + lq ),
                         WA( m, ((k+q)   % C->q) + lq ) );
                 }
@@ -112,7 +112,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
             if ( transB == ChamNoTrans ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    ChamUpperLower, tempkk, tempnn,
                     B(   k,              n ),
                     WB( (k % C->p) + lp, n ) );
 
@@ -121,7 +121,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                 for ( p=1; p < C->p; p++ ) {
                     INSERT_TASK_zlacpy(
                         options,
-                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        ChamUpperLower, tempkk, tempnn,
                         WB( ((k+p-1) % C->p) + lp, n ),
                         WB( ((k+p)   % C->p) + lp, n ) );
                 }
@@ -129,7 +129,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
             else {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempnn, tempkk, C->mb,
+                    ChamUpperLower, tempnn, tempkk,
                     B(   n,              k ),
                     WB( (k % C->p) + lp, n ) );
 
@@ -138,7 +138,7 @@ chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_tran
                 for ( p=1; p < C->p; p++ ) {
                     INSERT_TASK_zlacpy(
                         options,
-                        ChamUpperLower, tempnn, tempkk, C->mb,
+                        ChamUpperLower, tempnn, tempkk,
                         WB( ((k+p-1) % C->p) + lp, n ),
                         WB( ((k+p)   % C->p) + lp, n ) );
                 }
diff --git a/compute/pzgenm2.c b/compute/pzgenm2.c
index ec1c9cfc7..c7b209946 100644
--- a/compute/pzgenm2.c
+++ b/compute/pzgenm2.c
@@ -144,7 +144,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
             if ( (m != 0) || (n != 0) ) {
                 INSERT_TASK_dlacpy(
                     &options,
-                    ChamUpperLower, 1, 1, 1,
+                    ChamUpperLower, 1, 1,
                     NRMX(0, 0),
                     NRMX(m, n) );
             }
@@ -200,7 +200,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
 #else
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpperLower, 1, tempnn, tempnn,
+                        ChamUpperLower, 1, tempnn,
                         DROW( 0, n ),
                         X(    0, n ) );
 #endif
@@ -210,7 +210,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
                 for (m = 1; m < A->p; m++) {
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpperLower, 1, tempnn, tempnn,
+                        ChamUpperLower, 1, tempnn,
                         X( 0, n ),
                         X( m, n ) );
                 }
@@ -263,7 +263,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
             for (k = 1; k < A->q; k++) {
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpperLower, tempmm, 1, tempmm,
+                    ChamUpperLower, tempmm, 1,
                     SX( m, 0 ),
                     SX( m, k ) );
             }
@@ -298,7 +298,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
             for (k = 1; k < A->p; k++) {
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpperLower, 1, tempnn, tempnn,
+                    ChamUpperLower, 1, tempnn,
                     X( 0, n ),
                     X( k, n ) );
             }
@@ -340,7 +340,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
             for(n = 1; n < A->q; n++) {
                 INSERT_TASK_dlacpy(
                     &options,
-                    ChamUpperLower, 1, 1, 1,
+                    ChamUpperLower, 1, 1,
                     NRMX( myp, 0 ),
                     NRMX( myp, n ) );
             }
@@ -381,7 +381,7 @@ chameleon_pzgenm2( double tol, const CHAM_desc_t *A, double *result,
             for(m = 1; m < A->p; m++) {
                 INSERT_TASK_dlacpy(
                     &options,
-                    ChamUpperLower, 1, 1, 1,
+                    ChamUpperLower, 1, 1,
                     NRMSX( 0, myq ),
                     NRMSX( m, myq ) );
             }
diff --git a/compute/pzgeqrf.c b/compute/pzgeqrf.c
index 9baf7ae00..a7637392d 100644
--- a/compute/pzgeqrf.c
+++ b/compute/pzgeqrf.c
@@ -98,7 +98,7 @@ void chameleon_pzgeqrf( int genD, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *D
             int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamLower, tempDkm, tempDkn, A->nb,
+                ChamLower, tempDkm, tempDkn,
                 A(k, k),
                 D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzgeqrf_param.c b/compute/pzgeqrf_param.c
index accf33506..a4e08e061 100644
--- a/compute/pzgeqrf_param.c
+++ b/compute/pzgeqrf_param.c
@@ -77,7 +77,7 @@ int chameleon_pzgeqrf_param_step( int genD, cham_uplo_t uplo, int k, int ib,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamLower, tempDpm, tempDkn, A->nb,
+                ChamLower, tempDpm, tempDkn,
                 A(p, k), D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
             INSERT_TASK_zlaset(
diff --git a/compute/pzgeqrfrh.c b/compute/pzgeqrfrh.c
index 8cd96c65c..21c1b43ed 100644
--- a/compute/pzgeqrfrh.c
+++ b/compute/pzgeqrfrh.c
@@ -104,7 +104,7 @@ void chameleon_pzgeqrfrh( int genD, int BS, CHAM_desc_t *A, CHAM_desc_t *T, CHAM
 
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempDMm, tempDkn, A->nb,
+                    ChamLower, tempDMm, tempDkn,
                     A(M, k),
                     D(M, k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzgetrf_incpiv.c b/compute/pzgetrf_incpiv.c
index 69505a5c4..1b61fa1e6 100644
--- a/compute/pzgetrf_incpiv.c
+++ b/compute/pzgetrf_incpiv.c
@@ -97,7 +97,7 @@ void chameleon_pzgetrf_incpiv( CHAM_desc_t *A, CHAM_desc_t *L, CHAM_desc_t *D, i
 #if defined(CHAMELEON_COPY_DIAG)
             INSERT_TASK_zlacpy(
                 &options,
-                ChamUpperLower, tempkm, tempkn, A->nb,
+                ChamUpperLower, tempkm, tempkn,
                 A(k, k),
                 D(k));
 #endif
diff --git a/compute/pzhemm.c b/compute/pzhemm.c
index 175f2ce20..75f1ab66b 100644
--- a/compute/pzhemm.c
+++ b/compute/pzhemm.c
@@ -88,7 +88,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempam, tempak, C->mb,
+                ChamUpperLower, tempam, tempak,
                 A( Am, Ak ),
                 WA( m, (k % C->q) + lq ) );
 
@@ -97,7 +97,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempam, tempak, C->mb,
+                    ChamUpperLower, tempam, tempak,
                     WA( m, ((k+q-1) % C->q) + lq ),
                     WA( m, ((k+q)   % C->q) + lq ) );
             }
@@ -110,7 +110,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempkk, tempnn, C->mb,
+                ChamUpperLower, tempkk, tempnn,
                 B(   k,              n ),
                 WB( (k % C->p) + lp, n ) );
 
@@ -119,7 +119,7 @@ chameleon_pzhemm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    ChamUpperLower, tempkk, tempnn,
                     WB( ((k+p-1) % C->p) + lp, n ),
                     WB( ((k+p)   % C->p) + lp, n ) );
             }
@@ -205,7 +205,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempmm, tempkk, C->mb,
+                ChamUpperLower, tempmm, tempkk,
                 B(  m,  k ),
                 WA( m, (k % C->q) + lq ) );
 
@@ -214,7 +214,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    ChamUpperLower, tempmm, tempkk,
                     WA( m, ((k+q-1) % C->q) + lq ),
                     WA( m, ((k+q)   % C->q) + lq ) );
             }
@@ -245,7 +245,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempak, tempan, C->mb,
+                ChamUpperLower, tempak, tempan,
                 A(  Ak,              An ),
                 WB( (k % C->p) + lp, n  ) );
 
@@ -254,7 +254,7 @@ chameleon_pzhemm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempak, tempan, C->mb,
+                    ChamUpperLower, tempak, tempan,
                     WB( ((k+p-1) % C->p) + lp, n ),
                     WB( ((k+p)   % C->p) + lp, n ) );
             }
diff --git a/compute/pzhetrd_he2hb.c b/compute/pzhetrd_he2hb.c
index 8522f39d6..600e1ec7b 100644
--- a/compute/pzhetrd_he2hb.c
+++ b/compute/pzhetrd_he2hb.c
@@ -120,7 +120,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
 #if defined(CHAMELEON_COPY_DIAG)
            INSERT_TASK_zlacpy(
                &options,
-               ChamLower, tempkm, tempkn, A->nb,
+               ChamLower, tempkm, tempkn,
                A(k+1, k),
                E(k+1, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -271,7 +271,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
 #if defined(CHAMELEON_COPY_DIAG)
            INSERT_TASK_zlacpy(
                &options,
-               ChamUpper, tempkm, tempkn, A->nb,
+               ChamUpper, tempkm, tempkn,
                A(k, k+1),
                E(k, k+1) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -411,7 +411,7 @@ void chameleon_pzhetrd_he2hb(cham_uplo_t uplo,
     for (k = 1; k < A->nt; k++){
         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
         INSERT_TASK_zlacpy( &options,
-                            uplo, tempkn, tempkn, A->mb,
+                            uplo, tempkn, tempkn,
                             D(k), A(k, k));
     }
 
diff --git a/compute/pzlacpy.c b/compute/pzlacpy.c
index 1a2307992..280ff8473 100644
--- a/compute/pzlacpy.c
+++ b/compute/pzlacpy.c
@@ -53,7 +53,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B,
                 INSERT_TASK_zlacpy(
                     &options,
                     ChamUpper,
-                    X, Y, A->mb,
+                    X, Y,
                     A(m, m),
                     B(m, m));
             }
@@ -62,7 +62,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B,
                 INSERT_TASK_zlacpy(
                     &options,
                     ChamUpperLower,
-                    X, Y, A->mb,
+                    X, Y,
                     A(m, n),
                     B(m, n));
             }
@@ -79,7 +79,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B,
                 INSERT_TASK_zlacpy(
                     &options,
                     ChamLower,
-                    X, Y, A->mb,
+                    X, Y,
                     A(m, m),
                     B(m, m));
             }
@@ -88,7 +88,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B,
                 INSERT_TASK_zlacpy(
                     &options,
                     ChamUpperLower,
-                    X, Y, A->mb,
+                    X, Y,
                     A(m, n),
                     B(m, n));
             }
@@ -106,7 +106,7 @@ void chameleon_pzlacpy(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B,
                 INSERT_TASK_zlacpy(
                     &options,
                     ChamUpperLower,
-                    X, Y, A->mb,
+                    X, Y,
                     A(m, n),
                     B(m, n));
             }
diff --git a/compute/pzlange.c b/compute/pzlange.c
index 4acf29aa1..833b0f695 100644
--- a/compute/pzlange.c
+++ b/compute/pzlange.c
@@ -507,7 +507,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
             if ( (m != 0) || (n != 0) ) {
                 INSERT_TASK_dlacpy(
                     &options,
-                    ChamUpperLower, 1, 1, 1,
+                    ChamUpperLower, 1, 1,
                     W( &Welt, 0, 0 ), W( &Welt, m, n ) );
             }
         }
diff --git a/compute/pzlansy.c b/compute/pzlansy.c
index 5bb6b6242..1472f0a69 100644
--- a/compute/pzlansy.c
+++ b/compute/pzlansy.c
@@ -413,7 +413,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
             if ( (m != 0) || (n != 0) ) {
                 INSERT_TASK_dlacpy(
                     &options,
-                    ChamUpperLower, 1, 1, 1,
+                    ChamUpperLower, 1, 1,
                     W( &Welt, 0, 0 ), W( &Welt, m, n ));
             }
         }
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index 27adf0cdf..2f2fa1997 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -88,7 +88,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempam, tempak, C->mb,
+                ChamUpperLower, tempam, tempak,
                 A( Am, Ak ),
                 WA( m, (k % C->q) + lq ) );
 
@@ -97,7 +97,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempam, tempak, C->mb,
+                    ChamUpperLower, tempam, tempak,
                     WA( m, ((k+q-1) % C->q) + lq ),
                     WA( m, ((k+q)   % C->q) + lq ) );
             }
@@ -110,7 +110,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempkk, tempnn, C->mb,
+                ChamUpperLower, tempkk, tempnn,
                 B(   k,              n ),
                 WB( (k % C->p) + lp, n ) );
 
@@ -119,7 +119,7 @@ chameleon_pzsymm_summa_left( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    ChamUpperLower, tempkk, tempnn,
                     WB( ((k+p-1) % C->p) + lp, n ),
                     WB( ((k+p)   % C->p) + lp, n ) );
             }
@@ -205,7 +205,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempmm, tempkk, C->mb,
+                ChamUpperLower, tempmm, tempkk,
                 B(  m,  k ),
                 WA( m, (k % C->q) + lq ) );
 
@@ -214,7 +214,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( q=1; q < C->q; q++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    ChamUpperLower, tempmm, tempkk,
                     WA( m, ((k+q-1) % C->q) + lq ),
                     WA( m, ((k+q)   % C->q) + lq ) );
             }
@@ -245,7 +245,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
 
             INSERT_TASK_zlacpy(
                 options,
-                ChamUpperLower, tempak, tempan, C->mb,
+                ChamUpperLower, tempak, tempan,
                 A(  Ak,              An ),
                 WB( (k % C->p) + lp, n  ) );
 
@@ -254,7 +254,7 @@ chameleon_pzsymm_summa_right( CHAM_context_t *chamctxt, cham_uplo_t uplo,
             for ( p=1; p < C->p; p++ ) {
                 INSERT_TASK_zlacpy(
                     options,
-                    ChamUpperLower, tempak, tempan, C->mb,
+                    ChamUpperLower, tempak, tempan,
                     WB( ((k+p-1) % C->p) + lp, n ),
                     WB( ((k+p)   % C->p) + lp, n ) );
             }
diff --git a/compute/pzunglq.c b/compute/pzunglq.c
index f65f73e60..42f623cb2 100644
--- a/compute/pzunglq.c
+++ b/compute/pzunglq.c
@@ -123,7 +123,7 @@ void chameleon_pzunglq( int genD, CHAM_desc_t *A, CHAM_desc_t *Q, CHAM_desc_t *T
             int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
             INSERT_TASK_zlacpy(
                 &options,
-                ChamUpper, tempkmin, tempDkn, A->nb,
+                ChamUpper, tempkmin, tempDkn,
                 A(k, k),
                 D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunglq_param.c b/compute/pzunglq_param.c
index 1a15e384e..1f1963cee 100644
--- a/compute/pzunglq_param.c
+++ b/compute/pzunglq_param.c
@@ -137,7 +137,7 @@ void chameleon_pzunglq_param( int genD, const libhqr_tree_t *qrtree, CHAM_desc_t
                 int tempDpn = p == D->nt-1 ? D->n-p*D->nb : D->nb;
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkmin, tempDpn, A->nb,
+                    ChamUpper, tempkmin, tempDpn,
                     A(k, p),
                     D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunglqrh.c b/compute/pzunglqrh.c
index 397b5f1e8..f904846cd 100644
--- a/compute/pzunglqrh.c
+++ b/compute/pzunglqrh.c
@@ -150,7 +150,7 @@ void chameleon_pzunglqrh( int genD, int BS,
 
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamUpper, tempkmin, tempDNn, A->nb,
+                    ChamUpper, tempkmin, tempDNn,
                     A(k, N),
                     D(k, N) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzungqr.c b/compute/pzungqr.c
index c7ee3f5dd..cb4b8e967 100644
--- a/compute/pzungqr.c
+++ b/compute/pzungqr.c
@@ -123,7 +123,7 @@ void chameleon_pzungqr( int genD, CHAM_desc_t *A, CHAM_desc_t *Q,
 
             INSERT_TASK_zlacpy(
                 &options,
-                ChamLower, tempDkm, tempkmin, A->nb,
+                ChamLower, tempDkm, tempkmin,
                 A(k, k),
                 D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzungqr_param.c b/compute/pzungqr_param.c
index 16fbefb8d..2f5c24983 100644
--- a/compute/pzungqr_param.c
+++ b/compute/pzungqr_param.c
@@ -114,7 +114,7 @@ void chameleon_pzungqr_param_step( int genD, cham_uplo_t uplo, int k, int ib,
             int tempDmm = m == D->mt-1 ? D->m - m * D->mb : D->mb;
             INSERT_TASK_zlacpy(
                 options,
-                ChamLower, tempDmm, tempkmin, A->nb,
+                ChamLower, tempDmm, tempkmin,
                 A(m, k),
                 D(m, k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzungqrrh.c b/compute/pzungqrrh.c
index d500fe2db..e48b37ba1 100644
--- a/compute/pzungqrrh.c
+++ b/compute/pzungqrrh.c
@@ -150,7 +150,7 @@ void chameleon_pzungqrrh( int genD, int BS,
                 int tempDMm = M == D->mt-1 ? D->m-M*D->mb : D->mb;
                 INSERT_TASK_zlacpy(
                     &options,
-                    ChamLower, tempDMm, tempkmin, A->nb,
+                    ChamLower, tempDMm, tempkmin,
                     A(M, k),
                     D(M, k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmlq.c b/compute/pzunmlq.c
index 9edd667d7..43e695fc6 100644
--- a/compute/pzunmlq.c
+++ b/compute/pzunmlq.c
@@ -107,7 +107,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                     int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempDkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -202,7 +202,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                     int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempDkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -271,7 +271,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
                     int tempDkn = k == D->nt-1 ? D->n-k*D->nb : D->nb;
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempDkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -318,7 +318,7 @@ void chameleon_pzunmlq( int genD, cham_side_t side, cham_trans_t trans,
 
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamUpper, tempkmin, tempDkn, A->nb,
+                        ChamUpper, tempkmin, tempDkn,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmlq_param.c b/compute/pzunmlq_param.c
index b01b7e7a3..3acb7af52 100644
--- a/compute/pzunmlq_param.c
+++ b/compute/pzunmlq_param.c
@@ -106,7 +106,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -237,7 +237,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -331,7 +331,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -383,7 +383,7 @@ void chameleon_pzunmlq_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmlqrh.c b/compute/pzunmlqrh.c
index 8d9ad2ee5..af977675d 100644
--- a/compute/pzunmlqrh.c
+++ b/compute/pzunmlqrh.c
@@ -108,7 +108,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -263,7 +263,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -367,7 +367,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -416,7 +416,7 @@ void chameleon_pzunmlqrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamUpper, tempkmin, tempDpn, A->nb,
+                            ChamUpper, tempkmin, tempDpn,
                             A(k, p),
                             D(k, p) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmqr.c b/compute/pzunmqr.c
index d9f5e1850..17839a541 100644
--- a/compute/pzunmqr.c
+++ b/compute/pzunmqr.c
@@ -108,7 +108,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
 
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamLower, tempDkm, tempkmin, A->nb,
+                        ChamLower, tempDkm, tempkmin,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -203,7 +203,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
 
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamLower, tempDkm, tempkmin, A->nb,
+                        ChamLower, tempDkm, tempkmin,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -273,7 +273,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
 
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamLower, tempDkm, tempkmin, A->nb,
+                        ChamLower, tempDkm, tempkmin,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -320,7 +320,7 @@ void chameleon_pzunmqr( int genD, cham_side_t side, cham_trans_t trans,
 
                     INSERT_TASK_zlacpy(
                         &options,
-                        ChamLower, tempDkm, tempkmin, A->nb,
+                        ChamLower, tempDkm, tempkmin,
                         A(k, k),
                         D(k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmqr_param.c b/compute/pzunmqr_param.c
index ee2d0da7e..e2507120b 100644
--- a/compute/pzunmqr_param.c
+++ b/compute/pzunmqr_param.c
@@ -106,7 +106,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -237,7 +237,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -330,7 +330,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -382,7 +382,7 @@ void chameleon_pzunmqr_param( int genD, const libhqr_tree_t *qrtree,
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/pzunmqrrh.c b/compute/pzunmqrrh.c
index 98a816cd1..6ec5691ce 100644
--- a/compute/pzunmqrrh.c
+++ b/compute/pzunmqrrh.c
@@ -107,7 +107,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -260,7 +260,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -365,7 +365,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
@@ -415,7 +415,7 @@ void chameleon_pzunmqrrh( int genD, int BS, cham_side_t side, cham_trans_t trans
 
                         INSERT_TASK_zlacpy(
                             &options,
-                            ChamLower, tempDpm, tempkmin, A->nb,
+                            ChamLower, tempDpm, tempkmin,
                             A(p, k),
                             D(p, k) );
 #if defined(CHAMELEON_USE_CUDA)
diff --git a/compute/zplghe.c b/compute/zplghe.c
index efbd06911..2a097f637 100644
--- a/compute/zplghe.c
+++ b/compute/zplghe.c
@@ -31,7 +31,8 @@
  *
  * @ingroup CHAMELEON_Complex64_t
  *
- *  CHAMELEON_zplghe - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles.
+ * @brief Generate a random hermitian (positive definite if 'bump' is large
+ * enough) half-matrix by tiles.
  *
  *******************************************************************************
  *
@@ -136,11 +137,12 @@ int CHAMELEON_zplghe( double bump, cham_uplo_t uplo, int N,
  *
  * @ingroup CHAMELEON_Complex64_t_Tile
  *
- *  CHAMELEON_zplghe_Tile - Generate a random hermitian (positive definite if 'bump' is large enough) half-matrix by tiles.
- *  Tile equivalent of CHAMELEON_zplghe().
- *  Operates on matrices stored by tiles.
- *  All matrices are passed through descriptors.
- *  All dimensions are taken from the descriptors.
+ * @brief Generate a random hermitian (positive definite if 'bump' is large
+ * enough) half-matrix by tiles.
+ *
+ * Tile equivalent of CHAMELEON_zplghe().  Operates on matrices stored by
+ * tiles.  All matrices are passed through descriptors.  All dimensions are
+ * taken from the descriptors.
  *
  *******************************************************************************
  *
diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c
index dcec74350..ef9d191ac 100644
--- a/coreblas/compute/core_ztile.c
+++ b/coreblas/compute/core_ztile.c
@@ -368,7 +368,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t
 }
 
 void
-TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB )
+TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB )
 {
     assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
     assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) );
diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c
index 30347d332..3af4ac447 100644
--- a/coreblas/compute/core_ztile_empty.c
+++ b/coreblas/compute/core_ztile_empty.c
@@ -264,7 +264,7 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t
 }
 
 void
-TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB )
+TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB )
 {
     return;
 }
diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h
index 74443b887..8779531ad 100644
--- a/coreblas/include/coreblas/coreblas_ztile.h
+++ b/coreblas/include/coreblas/coreblas_ztile.h
@@ -43,7 +43,7 @@ int  TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_ti
 int  TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq );
 #endif
 void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B );
-void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB );
+void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, const CHAM_tile_t *A, int LDA, int displB, CHAM_tile_t *B, int LDB );
 void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA );
 #if defined(PRECISION_z) || defined(PRECISION_c)
 void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA );
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 85f0e30ee..9a26496f1 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -126,13 +126,13 @@ void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int m, int n, int mb,
+                         cham_uplo_t uplo, int m, int n,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *B, int Bm, int Bn );
 void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
-                          cham_uplo_t uplo, int m, int n, int mb,
-                          int displA, const CHAM_desc_t *A, int Am, int An,
-                          int displB, const CHAM_desc_t *B, int Bm, int Bn );
+                          cham_uplo_t uplo, int m, int n,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb );
 void INSERT_TASK_zlange( const RUNTIME_option_t *options,
                          cham_normtype_t norm, int M, int N, int NB,
                          const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/openmp/codelets/codelet_zlacpy.c b/runtime/openmp/codelets/codelet_zlacpy.c
index 25de93f94..8c5696d6a 100644
--- a/runtime/openmp/codelets/codelet_zlacpy.c
+++ b/runtime/openmp/codelets/codelet_zlacpy.c
@@ -20,10 +20,10 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_ztile.h"
 
-void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
-                          cham_uplo_t uplo, int m, int n, int nb,
-                          int displA, const CHAM_desc_t *A, int Am, int An,
-                          int displB, const CHAM_desc_t *B, int Bm, int Bn )
+void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, int m, int n,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
 {
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
     CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
@@ -31,24 +31,29 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
     assert( tileA->format & CHAMELEON_TILE_FULLRANK );
     assert( tileB->format & CHAMELEON_TILE_FULLRANK );
 
-#pragma omp task firstprivate( uplo, m, n, displA, tileA, displB, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] )
+#pragma omp task firstprivate( uplo, m, n, tileA, tileB ) depend( in:tileA[0] ) depend( inout:tileB[0] )
     {
-        CHAMELEON_Complex64_t *A = tileA->mat;
-        CHAMELEON_Complex64_t *B = tileB->mat;
-
-        CORE_zlacpy( uplo, m, n, A + displA, tileA->ld, B + displB, tileB->ld );
+        TCORE_zlacpy( uplo, m, n, tileA, tileB );
     }
 
     (void)options;
-    (void)nb;
 }
 
-void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int m, int n, int nb,
-                         const CHAM_desc_t *A, int Am, int An,
-                         const CHAM_desc_t *B, int Bm, int Bn )
+void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
+                          cham_uplo_t uplo, int m, int n,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
-    INSERT_TASK_zlacpyx( options, uplo, m, n, nb,
-                         0, A, Am, An,
-                         0, B, Bm, Bn );
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+
+#pragma omp task firstprivate( uplo, m, n, displA, tileA, lda, displB, tileB, ldb ) depend( in:tileA[0] ) depend( inout:tileB[0] )
+    {
+        TCORE_zlacpyx( uplo, m, n, displA, tileA, lda, displB, tileB, ldb );
+    }
+
+    (void)options;
 }
diff --git a/runtime/parsec/codelets/codelet_zlacpy.c b/runtime/parsec/codelets/codelet_zlacpy.c
index 5c047798d..65b9bc054 100644
--- a/runtime/parsec/codelets/codelet_zlacpy.c
+++ b/runtime/parsec/codelets/codelet_zlacpy.c
@@ -22,6 +22,48 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_z.h"
 
+static inline int
+CORE_zlacpy_parsec( parsec_execution_stream_t *context,
+                    parsec_task_t             *this_task )
+{
+    cham_uplo_t uplo;
+    int M;
+    int N;
+    CHAMELEON_Complex64_t *A;
+    int LDA;
+    CHAMELEON_Complex64_t *B;
+    int LDB;
+
+    parsec_dtd_unpack_args(
+        this_task, &uplo, &M, &N, &A, &LDA, &B, &LDB );
+
+    CORE_zlacpy( uplo, M, N, A, LDA, B, LDB );
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, int m, int n,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zlacpy_parsec, options->priority, "lacpy",
+        sizeof(cham_uplo_t), &uplo,        VALUE,
+        sizeof(int),         &m,           VALUE,
+        sizeof(int),         &n,           VALUE,
+        PASSED_BY_REF,       RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &(tileA->ld), VALUE,
+        PASSED_BY_REF,       RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY,
+        sizeof(int),         &(tileB->ld), VALUE,
+        PARSEC_DTD_ARG_END );
+}
+
 static inline int
 CORE_zlacpyx_parsec( parsec_execution_stream_t *context,
                     parsec_task_t             *this_task )
@@ -39,42 +81,29 @@ CORE_zlacpyx_parsec( parsec_execution_stream_t *context,
     parsec_dtd_unpack_args(
         this_task, &uplo, &M, &N, &displA, &A, &LDA, &displB, &B, &LDB );
 
-    CORE_zlacpy( uplo, M, N, A + (displA), LDA, B + (displB), LDB );
+    CORE_zlacpy( uplo, M, N, A + displA, LDA, B + displB, LDB );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
 void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
-                          cham_uplo_t uplo, int m, int n, int nb,
-                          int displA, const CHAM_desc_t *A, int Am, int An,
-                          int displB, const CHAM_desc_t *B, int Bm, int Bn )
+                          cham_uplo_t uplo, int m, int n,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
-    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
 
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zlacpyx_parsec, options->priority, "lacpy",
-        sizeof(cham_uplo_t),    &uplo,                      VALUE,
-        sizeof(int),           &m,                         VALUE,
-        sizeof(int),           &n,                         VALUE,
-        sizeof(int),           &displA,                    VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
-        sizeof(int), &(tileA->ld), VALUE,
-        sizeof(int),           &displB,                    VALUE,
-        PASSED_BY_REF,         RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY,
-        sizeof(int), &(tileB->ld), VALUE,
+        sizeof(cham_uplo_t), &uplo,   VALUE,
+        sizeof(int),         &m,      VALUE,
+        sizeof(int),         &n,      VALUE,
+        sizeof(int),         &displA, VALUE,
+        PASSED_BY_REF,       RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &lda,    VALUE,
+        sizeof(int),         &displB, VALUE,
+        PASSED_BY_REF,       RTBLKADDR( B, CHAMELEON_Complex64_t, Bm, Bn ), chameleon_parsec_get_arena_index( B ) | OUTPUT | AFFINITY,
+        sizeof(int),         &ldb,    VALUE,
         PARSEC_DTD_ARG_END );
-    (void)nb;
-}
-
-void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int m, int n, int nb,
-                         const CHAM_desc_t *A, int Am, int An,
-                         const CHAM_desc_t *B, int Bm, int Bn )
-{
-    INSERT_TASK_zlacpyx( options, uplo, m, n, nb,
-                         0, A, Am, An,
-                         0, B, Bm, Bn );
 }
diff --git a/runtime/quark/codelets/codelet_zlacpy.c b/runtime/quark/codelets/codelet_zlacpy.c
index 11992d320..90a9ae5fb 100644
--- a/runtime/quark/codelets/codelet_zlacpy.c
+++ b/runtime/quark/codelets/codelet_zlacpy.c
@@ -29,50 +29,77 @@
 static inline void CORE_zlacpy_quark(Quark *quark)
 {
     cham_uplo_t uplo;
-    int M;
-    int N;
-    int displA;
-    CHAM_tile_t *tileA;
-    CHAMELEON_Complex64_t *A;
-    int displB;
-    CHAM_tile_t *tileB;
-    CHAMELEON_Complex64_t *B;
+    int M, N;
+    CHAM_tile_t *tileA, *tileB;
 
-    quark_unpack_args_7(quark, uplo, M, N, displA, tileA, displB, tileB);
+    quark_unpack_args_5(quark, uplo, M, N, tileA, tileB);
 
     assert( tileA->format & CHAMELEON_TILE_FULLRANK );
     assert( tileB->format & CHAMELEON_TILE_FULLRANK );
 
-    A = tileA->mat;
-    B = tileB->mat;
-    CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld );
+    TCORE_zlacpy( uplo, M, N, tileA, tileB );
 }
 
-void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
-                          cham_uplo_t uplo, int m, int n, int nb,
-                          int displA, const CHAM_desc_t *A, int Am, int An,
-                          int displB, const CHAM_desc_t *B, int Bm, int Bn )
+/*
+ * Submit a QUARK task running CORE_zlacpy_quark, i.e. TCORE_zlacpy() with
+ * the tile A(Am, An) as INPUT and the tile B(Bm, Bn) as OUTPUT.
+ */
+void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, int m, int n,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     DAG_CORE_LACPY;
     QUARK_Insert_Task(opt->quark, CORE_zlacpy_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &uplo,   VALUE,
-        sizeof(int),                     &m,      VALUE,
-        sizeof(int),                     &n,      VALUE,
-        sizeof(int),                     &displA, VALUE,
+        sizeof(int),   &uplo,   VALUE,
+        sizeof(int),   &m,      VALUE,
+        sizeof(int),   &n,      VALUE,
         sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),             INPUT,
-        sizeof(int),                     &displB, VALUE,
         sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             OUTPUT,
         0);
-    (void)nb;
 }
 
-void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int m, int n, int nb,
-                         const CHAM_desc_t *A, int Am, int An,
-                         const CHAM_desc_t *B, int Bm, int Bn )
+/*
+ * QUARK kernel for lacpyx: unpacks the nine arguments packed by
+ * INSERT_TASK_zlacpyx() and forwards them to TCORE_zlacpyx().
+ */
+static inline void CORE_zlacpyx_quark(Quark *quark)
 {
-    INSERT_TASK_zlacpyx( options, uplo, m, n, nb,
-                         0, A, Am, An,
-                         0, B, Bm, Bn );
+    cham_uplo_t uplo;
+    int M, N;
+    int displA, displB;
+    int LDA, LDB;
+    CHAM_tile_t *tileA, *tileB;
+
+    quark_unpack_args_9(quark, uplo, M, N, displA, tileA, LDA, displB, tileB, LDB);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+
+    TCORE_zlacpyx( uplo, M, N, displA, tileA, LDA, displB, tileB, LDB );
+}
+
+/*
+ * Submit a QUARK task running CORE_zlacpyx_quark. The argument list below
+ * must stay in the order expected by quark_unpack_args_9() in that kernel.
+ */
+void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
+                          cham_uplo_t uplo, int m, int n,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_LACPY;
+    QUARK_Insert_Task(opt->quark, CORE_zlacpyx_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),   &uplo,   VALUE,
+        sizeof(int),   &m,      VALUE,
+        sizeof(int),   &n,      VALUE,
+        sizeof(int),   &displA, VALUE,
+        sizeof(void*), RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),             INPUT,
+        sizeof(int),   &lda,    VALUE,
+        sizeof(int),   &displB, VALUE,
+        sizeof(void*), RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),             OUTPUT,
+        sizeof(int),   &ldb,    VALUE,
+        0);
 }
diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c
index 7302ec42c..af49ba0c9 100644
--- a/runtime/starpu/codelets/codelet_zlacpy.c
+++ b/runtime/starpu/codelets/codelet_zlacpy.c
@@ -52,9 +52,7 @@ cl_zlacpy_cpu_func(void *descr[], void *cl_arg)
 
     assert( clargs->displA == 0 );
     assert( clargs->displB == 0 );
-    CHAMELEON_Complex64_t *A = tileA->mat;
-    CHAMELEON_Complex64_t *B = tileB->mat;
-    // CORE_zlacpy( clargs->uplo, clargs->m, clargs->n, A + clargs->displA, tileA->ld, B + clargs->displB, tileB->ld );
+
     TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB );
 }
 
@@ -68,8 +66,8 @@ cl_zlacpyx_cpu_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA, clargs->displB,
-                   tileA, clargs->lda, tileB, clargs->ldb );
+    TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA,
+                   tileA, clargs->lda, clargs->displB, tileB, clargs->ldb );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -80,9 +78,9 @@ CODELETS_CPU( zlacpy,  cl_zlacpy_cpu_func  )
 CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func )
 
 void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
-                          cham_uplo_t uplo, int m, int n, int nb,
-                          int displA, const CHAM_desc_t *A, int Am, int An,
-                          int displB, const CHAM_desc_t *B, int Bm, int Bn )
+                          cham_uplo_t uplo, int m, int n,
+                          int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                          int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb )
 {
     struct cl_zlacpy_args_s *clargs = NULL;
     void (*callback)(void*);
@@ -105,8 +103,8 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
         clargs->displB = displB;
         clargs->tileA  = A->get_blktile( A, Am, An );
         clargs->tileB  = B->get_blktile( B, Bm, Bn );
-        clargs->lda    = clargs->tileA->ld;
-        clargs->ldb    = clargs->tileB->ld;
+        clargs->lda    = lda;
+        clargs->ldb    = ldb;
     }
 
     /* Callback fro profiling information */
@@ -129,12 +127,10 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
 #endif
 
         0 );
-
-    (void)nb;
 }
 
 void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int m, int n, int nb,
+                         cham_uplo_t uplo, int m, int n,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *B, int Bm, int Bn )
 {
@@ -183,6 +179,4 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
 #endif
 
         0 );
-
-    (void)nb;
 }
-- 
GitLab