From a4f5668090d6fee465dc7ef3891d42b74c77d581 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 22 Jan 2019 11:29:10 +0100
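Subject: [PATCH] Fix distributed norms

In pzlange and pzlansy, the final copy loop over the process grid was
guarded by (m != 0) && (n != 0) and therefore skipped every entry of row 0
and column 0; the guard is now (m != 0) || (n != 0), so only (0,0) is
skipped.

The StarPU clag2z, zlange_max and zplssq2 task insertions no longer guard
on chameleon_desc_islocal(); they declare their accesses with the
CHAMELEON_ACCESS_* macros and always insert the task.

The PaRSEC, Quark and StarPU zplssq kernels skip the sum-of-squares update
when the output scale is zero, to avoid a division by zero.

Also removes the unused workn variable, the world_size/group_size context
fields and their macros, and moves the tag size setup of the tests to
testing_zauxiliary.c.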
---
 compute/pzlange.c                        | 34 +++++++++-------
 compute/pzlansy.c                        |  5 +--
 control/common.h                         |  3 --
 include/chameleon/struct.h               |  2 -
 runtime/openmp/codelets/codelet_zplssq.c | 18 ++++-----
 runtime/parsec/codelets/codelet_zplssq.c | 17 ++++----
 runtime/quark/codelets/codelet_zplssq.c  | 17 ++++----
 runtime/starpu/codelets/codelet_zgeadd.c |  4 +-
 runtime/starpu/codelets/codelet_zlacpy.c | 20 +++++-----
 runtime/starpu/codelets/codelet_zlag2c.c | 33 +++++++--------
 runtime/starpu/codelets/codelet_zlange.c | 39 +++++++++---------
 runtime/starpu/codelets/codelet_zplssq.c | 51 +++++++++++++-----------
 runtime/starpu/codelets/codelet_ztrasm.c |  4 +-
 testing/testing_zauxiliary.c             |  1 +
 testing/testing_zlange.c                 |  4 +-
 15 files changed, 132 insertions(+), 120 deletions(-)

diff --git a/compute/pzlange.c b/compute/pzlange.c
index ce1665743..fc2ecfe67 100644
--- a/compute/pzlange.c
+++ b/compute/pzlange.c
@@ -32,7 +32,7 @@
 static inline void
 chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
                        CHAM_desc_t *Wcol, CHAM_desc_t *Welt,
-                       RUNTIME_option_t *options)
+                       RUNTIME_option_t *options )
 {
     int m, n;
     int minMNT = chameleon_min( A->mt, A->nt );
@@ -58,7 +58,7 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
             int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
             int ldam = BLKLDD( A, m );
 
-            if ( (n == m)  && (uplo != ChamUpperLower) ) {
+            if ( (n == m) && (uplo != ChamUpperLower) ) {
                 INSERT_TASK_ztrasm(
                     options,
                     ChamColumnwise, uplo, diag, tempmm, tempnn,
@@ -95,7 +95,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
         INSERT_TASK_dlange(
             options,
             ChamMaxNorm, 1, tempnn, A->nb,
-            W( Wcol, 0, n), 1, W( Welt, 0, n));
+            W( Wcol, 0, n ), 1,
+            W( Welt, 0, n ) );
     }
 
     /**
@@ -105,7 +106,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
     for(n = Q; n < NT; n++) {
         INSERT_TASK_dlange_max(
             options,
-            W( Welt, 0, n), W( Welt, 0, n%Q) );
+            W( Welt, 0, n   ),
+            W( Welt, 0, n%Q ) );
     }
 
     /**
@@ -115,7 +117,8 @@ chameleon_pzlange_one( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A,
     for(n = 1; n < Q; n++) {
         INSERT_TASK_dlange_max(
             options,
-            W( Welt, 0, n), W( Welt, 0, 0) );
+            W( Welt, 0, n ),
+            W( Welt, 0, 0 ) );
     }
 }
 
@@ -247,13 +250,14 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
                 INSERT_TASK_zlange(
                     options,
                     ChamMaxNorm, tempmm, tempnn, A->nb,
-                    A(m, n), ldam, W( Welt, m, n));
+                    A(m, n), ldam, W( Welt, m, n ));
             }
 
             if ( n >= Q ) {
                 INSERT_TASK_dlange_max(
                     options,
-                    W( Welt, m, n), W( Welt, m, n%Q) );
+                    W( Welt, m, n   ),
+                    W( Welt, m, n%Q ) );
             }
         }
 
@@ -264,7 +268,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
         for(n = 1; n < Q; n++) {
             INSERT_TASK_dlange_max(
                 options,
-                W( Welt, m, n), W( Welt, m, 0) );
+                W( Welt, m, n ),
+                W( Welt, m, 0 ) );
         }
     }
 
@@ -275,7 +280,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
     for(m = P; m < MT; m++) {
         INSERT_TASK_dlange_max(
             options,
-            W( Welt, m, 0), W( Welt, m%P, 0) );
+            W( Welt, m,   0 ),
+            W( Welt, m%P, 0 ) );
     }
 
     /**
@@ -285,7 +291,8 @@ chameleon_pzlange_max( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
     for(m = 1; m < P; m++) {
         INSERT_TASK_dlange_max(
             options,
-            W( Welt, m, 0), W( Welt, 0, 0) );
+            W( Welt, m, 0 ),
+            W( Welt, 0, 0 ) );
     }
 }
 
@@ -382,7 +389,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
     double alpha = 0.0;
     double beta  = 0.0;
 
-    int workn, workmt, worknt;
+    int workmt, worknt;
     int m, n, wcol_init = 0;
 
     chamctxt = chameleon_context_self();
@@ -395,7 +402,6 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
 
     workmt = chameleon_max( A->mt, A->p );
     worknt = chameleon_max( A->nt, A->q );
-    workn  = chameleon_max( A->n,  A->q );
 
     switch ( norm ) {
     case ChamOneNorm:
@@ -502,7 +508,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
      */
     for(m = 0; m < A->p; m++) {
         for(n = 0; n < A->q; n++) {
-            if ( (m != 0) && (n != 0) ) {
+            if ( (m != 0) || (n != 0) ) {
                 INSERT_TASK_dlacpy(
                     &options,
                     ChamUpperLower, 1, 1, 1,
@@ -514,7 +520,7 @@ void chameleon_pzlange_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_dia
     CHAMELEON_Desc_Flush( &Welt, sequence );
     RUNTIME_sequence_wait(chamctxt, sequence);
 
-    *result = *(double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q );
+    *result = *((double *)Welt.get_blkaddr( &Welt, A->myrank / A->q, A->myrank % A->q ));
 
     if ( wcol_init ) {
         chameleon_desc_destroy( &Wcol );
diff --git a/compute/pzlansy.c b/compute/pzlansy.c
index c6d0ba73e..6a0e2bb00 100644
--- a/compute/pzlansy.c
+++ b/compute/pzlansy.c
@@ -315,7 +315,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
     double alpha = 0.0;
     double beta  = 0.0;
 
-    int workn, workmt, worknt;
+    int workmt, worknt;
     int m, n, wcol_init = 0;
 
     chamctxt = chameleon_context_self();
@@ -328,7 +328,6 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
 
     workmt = chameleon_max( A->mt, A->p );
     worknt = chameleon_max( A->nt, A->q );
-    workn  = chameleon_max( A->n,  A->q );
 
     switch ( norm ) {
     case ChamOneNorm:
@@ -415,7 +414,7 @@ void chameleon_pzlansy_generic( cham_normtype_t norm, cham_uplo_t uplo, cham_tra
      */
     for(m = 0; m < A->p; m++) {
         for(n = 0; n < A->q; n++) {
-            if ( (m != 0) && (n != 0) ) {
+            if ( (m != 0) || (n != 0) ) {
                 INSERT_TASK_dlacpy(
                     &options,
                     ChamUpperLower, 1, 1, 1,
diff --git a/control/common.h b/control/common.h
index 3c5dabf15..a9731f960 100644
--- a/control/common.h
+++ b/control/common.h
@@ -77,11 +77,8 @@
  *  Global shortcuts
  */
 #define CHAMELEON_RANK        chameleon_rank(chamctxt)
-#define CHAMELEON_SIZE        chamctxt->world_size
-#define CHAMELEON_GRPSIZE     chamctxt->group_size
 #define CHAMELEON_NB          chamctxt->nb
 #define CHAMELEON_IB          chamctxt->ib
-#define CHAMELEON_SCHEDULING  chamctxt->scheduling
 #define CHAMELEON_RHBLK       chamctxt->rhblock
 #define CHAMELEON_TRANSLATION chamctxt->translation
 #define CHAMELEON_PARALLEL    chamctxt->parallel_enabled
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index bc1fb9a05..ce3beb83e 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -109,8 +109,6 @@ typedef struct chameleon_context_s {
     int                my_mpi_rank;
     int                mpi_comm_size;
 #endif
-    int                world_size;
-    int                group_size;
 
     /* Boolean flags */
     cham_bool_t        warnings_enabled;
diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c
index bf72752ec..7ee45f66d 100644
--- a/runtime/openmp/codelets/codelet_zplssq.c
+++ b/runtime/openmp/codelets/codelet_zplssq.c
@@ -57,18 +57,22 @@
  *
  */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
-                        const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn )
+                         const CHAM_desc_t *IN,  int INm,  int INn,
+                         const CHAM_desc_t *OUT, int OUTm, int OUTn )
 {
-    double *scalesum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn);
-    double *scl = RTBLKADDR(SCLSSQ, double, SCLSSQm, SCLSSQn);
-#pragma omp task depend(in: scalesum[0]) depend(inout: scl[0])
+    double *sclssq_in  = RTBLKADDR(IN,  double, INm,  INn );
+    double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn);
+#pragma omp task depend(in: sclssq_in[0]) depend(inout: sclssq_out[0])
     {
-        if( scl[0] < scalesum[0] ) {
-            scl[1] = scalesum[1] + (scl[1]     * (( scl[0] / scalesum[0] ) * ( scl[0] / scalesum[0] )));
-            scl[0] = scalesum[0];
+        if( sclssq_out[0] < sclssq_in[0] ) {
+            sclssq_out[1] = sclssq_in[1]  + (sclssq_out[1] * (( sclssq_out[0] / sclssq_in[0] ) * ( sclssq_out[0] / sclssq_in[0] )));
+            sclssq_out[0] = sclssq_in[0];
         } else {
-            scl[1] = scl[1]     + (scalesum[1] * (( scalesum[0] / scl[0] ) * ( scalesum[0] / scl[0] )));
+            /* Same guard as the PaRSEC, Quark and StarPU kernels: when the output
+             * scale is zero, skip the update instead of dividing by zero. */
+            if ( sclssq_out[0] > 0 ) {
+                sclssq_out[1] = sclssq_out[1] + (sclssq_in[1]  * (( sclssq_in[0] / sclssq_out[0] ) * ( sclssq_in[0] / sclssq_out[0] )));
+            }
         }
     }
 }
diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c
index 6b5b493d1..ed1fde76f 100644
--- a/runtime/parsec/codelets/codelet_zplssq.c
+++ b/runtime/parsec/codelets/codelet_zplssq.c
@@ -56,17 +56,20 @@ static inline int
 CORE_zplssq_parsec( parsec_execution_stream_t *context,
                     parsec_task_t             *this_task )
 {
-    double *SCALESUMSQ;
-    double *SCLSSQ;
+    double *SCLSSQ_IN;
+    double *SCLSSQ_OUT;
 
     parsec_dtd_unpack_args(
-        this_task, &SCALESUMSQ, &SCLSSQ );
+        this_task, &SCLSSQ_IN, &SCLSSQ_OUT );
 
-    if( SCLSSQ[0] < SCALESUMSQ[0] ) {
-        SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1]     * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] )));
-        SCLSSQ[0] = SCALESUMSQ[0];
+    assert( SCLSSQ_OUT[0] >= 0. );
+    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
+        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
+        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
     } else {
-        SCLSSQ[1] = SCLSSQ[1]     + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] )));
+        if ( SCLSSQ_OUT[0] > 0 ) {
+            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
+        }
     }
 
     (void)context;
diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c
index 20b262564..79067050a 100644
--- a/runtime/quark/codelets/codelet_zplssq.c
+++ b/runtime/quark/codelets/codelet_zplssq.c
@@ -26,16 +26,19 @@
 
 void CORE_zplssq_quark(Quark *quark)
 {
-    double *SCALESUMSQ;
-    double *SCLSSQ;
+    double *SCLSSQ_IN;
+    double *SCLSSQ_OUT;
 
-    quark_unpack_args_2( quark, SCALESUMSQ, SCLSSQ );
+    quark_unpack_args_2( quark, SCLSSQ_IN, SCLSSQ_OUT );
 
-    if( SCLSSQ[0] < SCALESUMSQ[0] ) {
-        SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1]     * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] )));
-        SCLSSQ[0] = SCALESUMSQ[0];
+    assert( SCLSSQ_OUT[0] >= 0. );
+    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
+        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
+        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
     } else {
-        SCLSSQ[1] = SCLSSQ[1]     + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] )));
+        if ( SCLSSQ_OUT[0] > 0 ) {
+            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
+        }
     }
 }
 
diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index c5541b1d7..7c296d750 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -120,13 +120,13 @@ static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg)
     int M;
     int N;
     CHAMELEON_Complex64_t alpha;
-    CHAMELEON_Complex64_t *A;
+    const CHAMELEON_Complex64_t *A;
     int LDA;
     CHAMELEON_Complex64_t beta;
     CHAMELEON_Complex64_t *B;
     int LDB;
 
-    A = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    A = (const CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
     B = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &LDA, &beta, &LDB);
     CORE_zgeadd(trans, M, N, alpha, A, LDA, beta, B, LDB);
diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c
index a614181d0..0d2426400 100644
--- a/runtime/starpu/codelets/codelet_zlacpy.c
+++ b/runtime/starpu/codelets/codelet_zlacpy.c
@@ -32,17 +32,17 @@
  *
  */
 void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int m, int n, int nb,
-                        int displA, const CHAM_desc_t *A, int Am, int An, int lda,
-                        int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb)
+                         cham_uplo_t uplo, int m, int n, int nb,
+                         int displA, const CHAM_desc_t *A, int Am, int An, int lda,
+                         int displB, const CHAM_desc_t *B, int Bm, int Bn, int ldb)
 {
     (void)nb;
     struct starpu_codelet *codelet = &cl_zlacpy;
     void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_R(A, Am, An);
-    CHAMELEON_ACCESS_W(B, Bm, Bn);
+    CHAMELEON_ACCESS_R( A, Am, An );
+    CHAMELEON_ACCESS_W( B, Bm, Bn );
     CHAMELEON_END_ACCESS_DECLARATION;
 
     starpu_insert_task(
@@ -65,13 +65,13 @@ void INSERT_TASK_zlacpyx(const RUNTIME_option_t *options,
 }
 
 void INSERT_TASK_zlacpy(const RUNTIME_option_t *options,
-                       cham_uplo_t uplo, int m, int n, int nb,
-                       const CHAM_desc_t *A, int Am, int An, int lda,
-                       const CHAM_desc_t *B, int Bm, int Bn, int ldb)
+                        cham_uplo_t uplo, int m, int n, int nb,
+                        const CHAM_desc_t *A, int Am, int An, int lda,
+                        const CHAM_desc_t *B, int Bm, int Bn, int ldb)
 {
     INSERT_TASK_zlacpyx( options, uplo, m, n, nb,
-                        0, A, Am, An, lda,
-                        0, B, Bm, Bn, ldb );
+                         0, A, Am, An, lda,
+                         0, B, Bm, Bn, ldb );
 }
 
 #if !defined(CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c
index b65dc0059..43b4314c6 100644
--- a/runtime/starpu/codelets/codelet_zlag2c.c
+++ b/runtime/starpu/codelets/codelet_zlag2c.c
@@ -85,24 +85,25 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_clag2z;
     void (*callback)(void*) = options->profiling ? cl_clag2z_callback : NULL;
 
-    if ( chameleon_desc_islocal( A, Am, An ) ||
-         chameleon_desc_islocal( B, Bm, Bn ) )
-    {
-        starpu_insert_task(
-            starpu_mpi_codelet(codelet),
-            STARPU_VALUE,    &m,                 sizeof(int),
-            STARPU_VALUE,    &n,                 sizeof(int),
-            STARPU_R,         RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An),
-            STARPU_VALUE,    &lda,               sizeof(int),
-            STARPU_W,         RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
-            STARPU_VALUE,    &ldb,               sizeof(int),
-            STARPU_PRIORITY,  options->priority,
-            STARPU_CALLBACK,  callback,
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R( A, Am, An );
+    CHAMELEON_ACCESS_W( B, Bm, Bn );
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &m,                 sizeof(int),
+        STARPU_VALUE,    &n,                 sizeof(int),
+        STARPU_R,         RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An),
+        STARPU_VALUE,    &lda,               sizeof(int),
+        STARPU_W,         RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        STARPU_VALUE,    &ldb,               sizeof(int),
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "clag2z",
+        STARPU_NAME, "clag2z",
 #endif
-            0);
-    }
+        0);
 }
 
 
diff --git a/runtime/starpu/codelets/codelet_zlange.c b/runtime/starpu/codelets/codelet_zlange.c
index a90de22ca..f689d82bc 100644
--- a/runtime/starpu/codelets/codelet_zlange.c
+++ b/runtime/starpu/codelets/codelet_zlange.c
@@ -70,7 +70,7 @@ static void cl_zlange_cpu_func(void *descr[], void *cl_arg)
     work  = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
     normA = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
     starpu_codelet_unpack_args(cl_arg, &norm, &M, &N, &LDA);
-    CORE_zlange( norm, M, N, A, LDA, work, normA);
+    CORE_zlange( norm, M, N, A, LDA, work, normA );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -86,34 +86,35 @@ void INSERT_TASK_zlange_max(const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_zlange_max;
     void (*callback)(void*) = options->profiling ? cl_zlange_callback : NULL;
 
-    if ( chameleon_desc_islocal( A, Am, An ) ||
-         chameleon_desc_islocal( B, Bm, Bn ) )
-    {
-        starpu_insert_task(
-            starpu_mpi_codelet(codelet),
-            STARPU_R,        RTBLKADDR(A, double, Am, An),
-            STARPU_RW,       RTBLKADDR(B, double, Bm, Bn),
-            STARPU_PRIORITY, options->priority,
-            STARPU_CALLBACK, callback,
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R(  A, Am, An );
+    CHAMELEON_ACCESS_RW( B, Bm, Bn );
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_R,        RTBLKADDR(A, double, Am, An),
+        STARPU_RW,       RTBLKADDR(B, double, Bm, Bn),
+        STARPU_PRIORITY, options->priority,
+        STARPU_CALLBACK, callback,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "zlange_max",
+        STARPU_NAME, "zlange_max",
 #endif
-            0);
-    }
+        0);
 }
 
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zlange_max_cpu_func(void *descr[], void *cl_arg)
 {
     double *A;
-    double *normA;
-
-    A     = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
-    normA = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+    double *B;
 
-    if ( *A > *normA )
-        *normA = *A;
+    A = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+    B = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
 
+    if ( *A > *B ) {
+        *B = *A;
+    }
     (void)cl_arg;
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c
index eb79e712b..4fdbaf6c3 100644
--- a/runtime/starpu/codelets/codelet_zplssq.c
+++ b/runtime/starpu/codelets/codelet_zplssq.c
@@ -55,21 +55,21 @@
  *
  */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
-                        const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn )
+                        const CHAM_desc_t *SCLSSQ_IN,  int SCLSSQ_INm,  int SCLSSQ_INn,
+                        const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn )
 {
     struct starpu_codelet *codelet = &cl_zplssq;
     void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_R(SCALESUMSQ, SCALESUMSQm, SCALESUMSQn);
-    CHAMELEON_ACCESS_RW(SCLSSQ, SCLSSQm, SCLSSQn);
+    CHAMELEON_ACCESS_R(  SCLSSQ_IN,  SCLSSQ_INm,  SCLSSQ_INn  );
+    CHAMELEON_ACCESS_RW( SCLSSQ_OUT, SCLSSQ_OUTm, SCLSSQ_OUTn );
     CHAMELEON_END_ACCESS_DECLARATION;
 
     starpu_insert_task(
         starpu_mpi_codelet(codelet),
-        STARPU_R,  RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn),
-        STARPU_RW, RTBLKADDR(SCLSSQ,     double, SCLSSQm,     SCLSSQn),
+        STARPU_R,  RTBLKADDR( SCLSSQ_IN,  double, SCLSSQ_INm,  SCLSSQ_INn  ),
+        STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double, SCLSSQ_OUTm, SCLSSQ_OUTn ),
         STARPU_PRIORITY,    options->priority,
         STARPU_CALLBACK,    callback,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
@@ -82,17 +82,20 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zplssq_cpu_func(void *descr[], void *cl_arg)
 {
-    double *SCALESUMSQ;
-    double *SCLSSQ;
+    double *SCLSSQ_IN;
+    double *SCLSSQ_OUT;
 
-    SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
-    SCLSSQ     = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+    SCLSSQ_IN  = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+    SCLSSQ_OUT = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
 
-    if( SCLSSQ[0] < SCALESUMSQ[0] ) {
-        SCLSSQ[1] = SCALESUMSQ[1] + (SCLSSQ[1]     * (( SCLSSQ[0] / SCALESUMSQ[0] ) * ( SCLSSQ[0] / SCALESUMSQ[0] )));
-        SCLSSQ[0] = SCALESUMSQ[0];
+    assert( SCLSSQ_OUT[0] >= 0. );
+    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
+        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
+        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
     } else {
-        SCLSSQ[1] = SCLSSQ[1]     + (SCALESUMSQ[1] * (( SCALESUMSQ[0] / SCLSSQ[0] ) * ( SCALESUMSQ[0] / SCLSSQ[0] )));
+        if ( SCLSSQ_OUT[0] > 0 ) {
+            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
+        }
     }
 
     (void)cl_arg;
@@ -110,17 +113,19 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
     struct starpu_codelet *codelet = &cl_zplssq2;
     void (*callback)(void*) = options->profiling ? cl_zplssq2_callback : NULL;
 
-    if ( chameleon_desc_islocal( RESULT, RESULTm, RESULTn ) ) {
-        starpu_insert_task(
-            starpu_mpi_codelet(codelet),
-            STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn),
-            STARPU_PRIORITY,    options->priority,
-            STARPU_CALLBACK,    callback,
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_RW( RESULT, RESULTm, RESULTn );
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn),
+        STARPU_PRIORITY,    options->priority,
+        STARPU_CALLBACK,    callback,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-            STARPU_NAME, "zplssq2",
+        STARPU_NAME, "zplssq2",
 #endif
-            0);
-    }
+        0);
 }
 
 
diff --git a/runtime/starpu/codelets/codelet_ztrasm.c b/runtime/starpu/codelets/codelet_ztrasm.c
index 4e0dc05a6..c1d154aad 100644
--- a/runtime/starpu/codelets/codelet_ztrasm.c
+++ b/runtime/starpu/codelets/codelet_ztrasm.c
@@ -66,8 +66,8 @@ static void cl_ztrasm_cpu_func(void *descr[], void *cl_arg)
     int lda;
     double *work;
 
-    A     = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    work  = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+    A    = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
+    work = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
     starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &diag, &M, &N, &lda);
     CORE_ztrasm(storev, uplo, diag, M, N, A, lda, work);
 }
diff --git a/testing/testing_zauxiliary.c b/testing/testing_zauxiliary.c
index 2c0f361db..6ebe37715 100644
--- a/testing/testing_zauxiliary.c
+++ b/testing/testing_zauxiliary.c
@@ -182,6 +182,7 @@ int main (int argc, char **argv)
     CHAMELEON_Disable(CHAMELEON_AUTOTUNING);
     CHAMELEON_Set(CHAMELEON_TILE_SIZE,        nb );
     CHAMELEON_Set(CHAMELEON_INNER_BLOCK_SIZE, ib );
+    CHAMELEON_user_tag_size( 64, 54 );
 
     argc -= 6;
     argv += 6;
diff --git a/testing/testing_zlange.c b/testing/testing_zlange.c
index 864b1e4ce..34fde1bc4 100644
--- a/testing/testing_zlange.c
+++ b/testing/testing_zlange.c
@@ -51,11 +51,9 @@ int testing_zlange(int argc, char **argv)
 
     /* Allocate Data */
     CHAMELEON_Complex64_t *A    = (CHAMELEON_Complex64_t *)malloc(LDAxN*sizeof(CHAMELEON_Complex64_t));
-    double            *work = (double*) malloc(max(M,N)*sizeof(double));
+    double                *work = (double*) malloc(max(M,N)*sizeof(double));
     double normcham, normlapack, result;
 
-    RUNTIME_comm_set_tag_sizes( 31, 16 );
-
     eps = LAPACKE_dlamch_work('e');
 
     printf("\n");
-- 
GitLab