diff --git a/compute/pzlange.c b/compute/pzlange.c
index b5f58e47d25fc1c231faefa374e85b9b8e250b79..f909fa25fa34ff17bc6d0c72b811916a187d284e 100644
--- a/compute/pzlange.c
+++ b/compute/pzlange.c
@@ -329,13 +329,14 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
             else {
                 INSERT_TASK_zgessq(
                     options,
+                    ChamEltwise,
                     tempmm, tempnn,
                     A(m, n), ldam, W( Welt, m, n) );
             }
 
             if ( n >= Q ) {
                 INSERT_TASK_dplssq(
-                    options, W( Welt, m, n), W( Welt, m, n%Q) );
+                    options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) );
             }
         }
 
@@ -345,7 +346,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
          */
         for(n = 1; n < Q; n++) {
             INSERT_TASK_dplssq(
-                options, W( Welt, m, n), W( Welt, m, 0) );
+                options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
         }
     }
 
@@ -355,7 +356,7 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
      */
     for(m = P; m < MT; m++) {
         INSERT_TASK_dplssq(
-            options, W( Welt, m, 0), W( Welt, m%P, 0) );
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
     }
 
     /**
@@ -364,11 +365,11 @@ chameleon_pzlange_frb( cham_uplo_t uplo, cham_diag_t diag, CHAM_desc_t *A, CHAM_
      */
     for(m = 1; m < P; m++) {
         INSERT_TASK_dplssq(
-            options, W( Welt, m, 0), W( Welt, 0, 0) );
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
     }
 
     INSERT_TASK_dplssq2(
-        options, W( Welt, 0, 0) );
+        options, 1, W( Welt, 0, 0) );
 }
 
 /**
diff --git a/compute/pzlansy.c b/compute/pzlansy.c
index 71189ef578ffa2fd5f5b3d3feac28508d7bb86c5..e799f57285430e09eccd348abdca54d0c31d6cdc 100644
--- a/compute/pzlansy.c
+++ b/compute/pzlansy.c
@@ -241,21 +241,21 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
             if ( n == m ) {
                 if ( trans == ChamConjTrans) {
                     INSERT_TASK_zhessq(
-                        options, uplo, tempmm,
+                        options, ChamEltwise, uplo, tempmm,
                         A(m, n), ldam, W( Welt, m, n) );
                 }
                 else {
                     INSERT_TASK_zsyssq(
-                        options, uplo, tempmm,
+                        options, ChamEltwise, uplo, tempmm,
                         A(m, n), ldam, W( Welt, m, n) );
                 }
             }
             else {
                 INSERT_TASK_zgessq(
-                    options, tempmm, tempnn,
+                    options, ChamEltwise, tempmm, tempnn,
                     A(m, n), ldam, W( Welt, m, n) );
                 INSERT_TASK_zgessq(
-                    options, tempmm, tempnn,
+                    options, ChamEltwise, tempmm, tempnn,
                     A(m, n), ldam, W( Welt, n, m) );
             }
         }
@@ -264,7 +264,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
     for(m = 0; m < MT; m++) {
         for(n = Q; n < NT; n++) {
             INSERT_TASK_dplssq(
-                options, W( Welt, m, n), W( Welt, m, n%Q) );
+                options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, n%Q) );
         }
 
         /**
@@ -273,7 +273,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
          */
         for(n = 1; n < Q; n++) {
             INSERT_TASK_dplssq(
-                options, W( Welt, m, n), W( Welt, m, 0) );
+                options, ChamEltwise, 1, 1, W( Welt, m, n), W( Welt, m, 0) );
         }
     }
 
@@ -283,7 +283,7 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
      */
     for(m = P; m < MT; m++) {
         INSERT_TASK_dplssq(
-            options, W( Welt, m, 0), W( Welt, m%P, 0) );
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, m%P, 0) );
     }
 
     /**
@@ -292,11 +292,11 @@ chameleon_pzlansy_frb( cham_trans_t trans, cham_uplo_t uplo,
      */
     for(m = 1; m < P; m++) {
         INSERT_TASK_dplssq(
-            options, W( Welt, m, 0), W( Welt, 0, 0) );
+            options, ChamEltwise, 1, 1, W( Welt, m, 0), W( Welt, 0, 0) );
     }
 
     INSERT_TASK_dplssq2(
-        options, W( Welt, 0, 0) );
+        options, 1, W( Welt, 0, 0) );
 }
 
 /**
diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index 009cd37dbb5112325e7622b5eef71715d661fde1..d44a83fd7a2480d0c5197f4b4d4477aebfa4be16 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -66,6 +66,7 @@ set(ZSRC
     core_zplghe.c
     core_zplgsy.c
     core_zplrnt.c
+    core_zplssq.c
     core_zpotrf.c
     core_zssssm.c
     core_zsymm.c
diff --git a/coreblas/compute/core_zgessq.c b/coreblas/compute/core_zgessq.c
index aba33e5f5102baf1632332be7cbbc2d97d2259c5..71aa407bdebde121a438604a3d08a9145b802e2f 100644
--- a/coreblas/compute/core_zgessq.c
+++ b/coreblas/compute/core_zgessq.c
@@ -15,23 +15,98 @@
  * @comment This file has been automatically generated
  *          from Plasma 2.6.0 for CHAMELEON 0.9.2
  * @author Mathieu Faverge
+ * @author Florent Pruvost
  * @date 2014-11-16
  * @precisions normal z -> c d s
  *
  */
 #include <math.h>
 #include "coreblas/lapacke.h"
+#include "coreblas/sumsq_update.h"
 #include "coreblas.h"
 
-#define UPDATE( __nb, __value )                                         \
-    if (__value != 0. ){                                                \
-        if ( *scale < __value ) {                                       \
-            *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \
-            *scale = __value;                                           \
-        } else {                                                        \
-            *sumsq = *sumsq + __nb * ( __value / *scale ) *  ( __value / *scale ); \
-        }                                                               \
+/**
+ * @brief Subcase storev == ChamColumnwise of CORE_zgessq()
+ */
+static inline int
+CORE_zgessq_col( int M, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr, *tmpScale, *tmpSumsq;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * LDA );
+        tmpScale = sclssq+2*j;
+        tmpSumsq = sclssq+2*j+1;
+        for(i=0; i<M; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
+#endif
+        }
     }
+    return CHAMELEON_SUCCESS;
+}
+/**
+ * @brief Subcase storev == ChamRowwise of CORE_zgessq()
+ */
+static inline int
+CORE_zgessq_row( int M, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr, *tmpScale, *tmpSumsq;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * LDA );
+        tmpScale = sclssq;
+        tmpSumsq = sclssq+1;
+        for(i=0; i<M; i++, ptr++, tmpScale+=2, tmpSumsq+=2) {
+            tmp = fabs(*ptr);
+            sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 1., tmpScale, tmpSumsq, &tmp );
+#endif
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
+/**
+ * @brief Subcase storev == ChamEltwise of CORE_zgessq()
+ */
+static inline int
+CORE_zgessq_elt( int M, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * LDA );
+        for(i=0; i<M; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#endif
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
 
 /**
  *
@@ -60,6 +135,12 @@
  *
  *******************************************************************************
  *
+ * @param[in] storev
+ *          Specifies whether the sums are made per column or row.
+ *          = ChamColumnwise: Computes the sum of squares on each column
+ *          = ChamRowwise:    Computes the sum of squares on each row
+ *          = ChamEltwise:    Computes the sum of squares on all the matrix
+ *
  *  @param[in] M
  *          The number of rows in the tile A.
  *
@@ -72,13 +153,16 @@
  *  @param[in] LDA
  *          The leading dimension of the tile A. LDA >= max(1,M).
  *
- *  @param[in,out] scale
- *          On entry, the value  scale  in the equation above.
- *          On exit, scale is overwritten with the value scl.
- *
- *  @param[in,out] sumsq
- *          On entry, the value  sumsq  in the equation above.
- *          On exit, SUMSQ is overwritten with the value ssq.
+ *  @param[in,out] sclssq
+ *          Dimension:  (2,K)
+ *          if storev == ChamColumnwise, K = N
+ *          if storev == ChamRowwise,    K = M
+ *          if storev == ChamEltwise,    K = 1
+ *          On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1])
+ *          which corresponds to (scale, sumsq) in the equation below
+ *          ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
+ *          respectively for the columns, the rows and the full matrix
+ *          On exit, each couple is overwritten with the final result (scl, ssq).
  *
  *******************************************************************************
  *
@@ -86,26 +170,18 @@
  * @retval -k, the k-th argument had an illegal value
  *
  */
-int CORE_zgessq(int M, int N,
-                const CHAMELEON_Complex64_t *A, int LDA,
-                double *scale, double *sumsq)
+int CORE_zgessq( cham_store_t storev, int M, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
 {
-    int i, j;
-    double tmp;
-    double *ptr;
-
-    for(j=0; j<N; j++) {
-        ptr = (double*) ( A + j * LDA );
-        for(i=0; i<M; i++, ptr++) {
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-            ptr++;
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-#endif
-        }
+    if (storev == ChamColumnwise) {
+        CORE_zgessq_col( M, N, A, LDA, sclssq );
+    }
+    else if (storev == ChamRowwise) {
+        CORE_zgessq_row( M, N, A, LDA, sclssq );
+    }
+    else {
+        CORE_zgessq_elt( M, N, A, LDA, sclssq );
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/coreblas/compute/core_zhessq.c b/coreblas/compute/core_zhessq.c
index 482cc6fb42d14acf3f945801f645e734001ce317..255e7e7f716f6b2cd3475c11aefac373814a2ae8 100644
--- a/coreblas/compute/core_zhessq.c
+++ b/coreblas/compute/core_zhessq.c
@@ -19,20 +19,8 @@
  * @precisions normal z -> c
  *
  */
-#include <math.h>
-#include "coreblas/lapacke.h"
 #include "coreblas.h"
 
-#define UPDATE( __nb, __value )                                         \
-    if (__value != 0. ){                                                \
-        if ( *scale < __value ) {                                       \
-            *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \
-            *scale = __value;                                           \
-        } else {                                                        \
-            *sumsq = *sumsq + __nb * ( __value / *scale ) *  ( __value / *scale ); \
-        }                                                               \
-    }
-
 /**
  *
  * @ingroup CORE_CHAMELEON_Complex64_t
@@ -56,18 +44,24 @@
  * SCALE and SUMSQ are overwritten by scl and ssq respectively.
  *
  * The routine makes only one pass through the tile triangular part of the
- * hermitian tile A defined by uplo.
+ * symmetric tile A defined by uplo.
  * See also LAPACK zlassq.f
  *
  *******************************************************************************
  *
+ * @param[in] storev
+ *          Specifies whether the sums are made per column or row.
+ *          = ChamColumnwise: Computes the sum of squares on each column
+ *          = ChamRowwise:    Computes the sum of squares on each row
+ *          = ChamEltwise:    Computes the sum of squares on all the matrix
+ *
  *  @param[in] uplo
  *          Specifies whether the upper or lower triangular part of
- *          the hermitian matrix A is to be referenced as follows:
+ *          the symmetric matrix A is to be referenced as follows:
  *          = ChamLower:     Only the lower triangular part of the
- *                             hermitian matrix A is to be referenced.
+ *                             symmetric matrix A is to be referenced.
  *          = ChamUpper:     Only the upper triangular part of the
- *                             hermitian matrix A is to be referenced.
+ *                             symmetric matrix A is to be referenced.
  *
  *  @param[in] N
  *          The number of columns and rows in the tile A.
@@ -78,13 +72,16 @@
  *  @param[in] LDA
  *          The leading dimension of the tile A. LDA >= max(1,N).
  *
- *  @param[in,out] scale
- *          On entry, the value  scale  in the equation above.
- *          On exit, scale is overwritten with the value scl.
- *
- *  @param[in,out] sumsq
- *          On entry, the value  sumsq  in the equation above.
- *          On exit, SUMSQ is overwritten with the value ssq.
+ *  @param[in,out] sclssq
+ *          Dimension:  (2,K)
+ *          if storev == ChamColumnwise, K = N
+ *          if storev == ChamRowwise,    K = N
+ *          if storev == ChamEltwise,    K = 1
+ *          On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1])
+ *          which corresponds to (scale, sumsq) in the equation below
+ *          ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
+ *          respectively for the columns, the rows and the full matrix
+ *          On exit, each couple is overwritten with the final result (scl, ssq).
  *
  *******************************************************************************
  *
@@ -92,65 +89,9 @@
  * @retval -k, the k-th argument had an illegal value
  *
  */
-
-int CORE_zhessq(cham_uplo_t uplo, int N,
-                const CHAMELEON_Complex64_t *A, int LDA,
-                double *scale, double *sumsq)
+int CORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
 {
-    int i, j;
-    double tmp;
-    double *ptr;
-
-    if ( uplo == ChamUpper ) {
-        for(j=0; j<N; j++) {
-            ptr = (double*) ( A + j * LDA );
-
-            for(i=0; i<j; i++, ptr++) {
-
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-                ptr++;
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-#endif
-            }
-
-            /* Diagonal */
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-            ptr++;
-#endif
-        }
-    } else {
-
-        for(j=0; j<N; j++) {
-            ptr = (double*) ( A + j * LDA + j);
-
-            /* Diagonal */
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-            ptr++;
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-            ptr++;
-#endif
-
-            for(i=j+1; i<N; i++, ptr++) {
-
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-                ptr++;
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-#endif
-            }
-        }
-    }
-    return CHAMELEON_SUCCESS;
+    return CORE_zsyssq( storev, uplo, N, A, LDA, sclssq );
 }
diff --git a/coreblas/compute/core_zplssq.c b/coreblas/compute/core_zplssq.c
new file mode 100644
index 0000000000000000000000000000000000000000..9549453dcd50df72fc2780eadc80d81ceeac9a83
--- /dev/null
+++ b/coreblas/compute/core_zplssq.c
@@ -0,0 +1,193 @@
+/**
+ *
+ * @file core_zplssq.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon core_zplssq CPU kernel
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-01
+ * @precisions normal z -> c d s
+ *
+ */
+#include <math.h>
+#include "coreblas/lapacke.h"
+#include "coreblas/sumsq_update.h"
+#include "coreblas.h"
+
+/**
+ * @brief Subcase storev == ChamColumnwise of CORE_zplssq()
+ */
+static inline int
+CORE_zplssq_col( int M, int N,
+                 double *sclssqin,
+                 double *sclssqout )
+{
+    int i, j;
+    double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut;
+
+    tmpScaleIn = sclssqin;
+    tmpSumsqIn = sclssqin+1;
+    tmpScaleOut = sclssqout;
+    tmpSumsqOut = sclssqout+1;
+
+    for(j=0; j<N; j++) {
+        for(i=0; i<M; i++) {
+            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            tmpScaleIn+=2;
+            tmpSumsqIn+=2;
+        }
+        tmpScaleOut+=2;
+        tmpSumsqOut+=2;
+    }
+    return CHAMELEON_SUCCESS;
+}
+/**
+ * @brief Subcase storev == ChamRowwise of CORE_zplssq()
+ */
+static inline int
+CORE_zplssq_row( int M, int N,
+                 double *sclssqin,
+                 double *sclssqout )
+{
+    int i, j;
+    double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut;
+
+    tmpScaleIn = sclssqin;
+    tmpSumsqIn = sclssqin+1;
+
+    for(j=0; j<N; j++) {
+        tmpScaleOut = sclssqout;
+        tmpSumsqOut = sclssqout+1;
+        for(i=0; i<M; i++) {
+            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            tmpScaleIn+=2;
+            tmpSumsqIn+=2;
+            tmpScaleOut+=2;
+            tmpSumsqOut+=2;
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
+/**
+ * @brief Subcase storev == ChamEltwise of CORE_zplssq()
+ */
+static inline int
+CORE_zplssq_elt( int M, int N,
+                 double *sclssqin,
+                 double *sclssqout )
+{
+    int i, j;
+    double *tmpScaleIn, *tmpSumsqIn, *tmpScaleOut, *tmpSumsqOut;
+
+    tmpScaleIn = sclssqin;
+    tmpSumsqIn = sclssqin+1;
+    tmpScaleOut = sclssqout;
+    tmpSumsqOut = sclssqout+1;
+
+    for(j=0; j<N; j++) {
+        for(i=0; i<M; i++) {
+            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            tmpScaleIn+=2;
+            tmpSumsqIn+=2;
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ *  CORE_zplssq adds the values ssq ensuring scl >= 0
+ *
+ *******************************************************************************
+ *
+ * @param[in] storev
+ *          Specifies whether the sums are made per column or row.
+ *          = ChamColumnwise: Computes the sum of squares on each column
+ *          = ChamRowwise:    Computes the sum of squares on each row
+ *          = ChamEltwise:    Computes the sum of squares on all the matrix
+ *
+ *  @param[in] M
+ *          The number of rows in the tile sclssqin.
+ *
+ *  @param[in] N
+ *          The number of columns in the tile sclssqin.
+ *
+ *  @param[in] sclssqin
+ *          The 2*M-by-2*N matrix on which to compute the sums.
+ *
+ *  @param[in,out] sclssqout
+ *          Dimension:  (2,K)
+ *          if storev == ChamColumnwise, K = N
+ *          if storev == ChamRowwise,    K = M
+ *          if storev == ChamEltwise,    K = 1
+ *          On entry, sclssqout contains M-by-N couples (sclssqout[2*i], sclssqout[2*i+1])
+ *          which corresponds to (scale, sumsq) in the equation below
+ *          ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
+ *          respectively for the columns, the rows and the full matrix
+ *          On exit, each couple is overwritten with the final result (scl, ssq).
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval -k, the k-th argument had an illegal value
+ *
+ */
+int CORE_zplssq( cham_store_t storev, int M, int N,
+                 double *sclssqin,
+                 double *sclssqout )
+{
+    if (storev == ChamColumnwise) {
+        CORE_zplssq_col( M, N, sclssqin, sclssqout );
+    }
+    else if (storev == ChamRowwise) {
+        CORE_zplssq_row( M, N, sclssqin, sclssqout );
+    }
+    else {
+        CORE_zplssq_elt( M, N, sclssqin, sclssqout );
+    }
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ *  CORE_zplssq2 computes scl*sqrt(ssq) for each couple in a vector
+ *
+ *******************************************************************************
+ *
+ *  @param[in] N
+ *          The number of columns in the tile sclssq.
+ *
+ *  @param[in,out] sclssq
+ *          The 2*N matrix on which to compute scl*sqrt(ssq)
+ *          On entry contains all couple (scl,ssq) in (sclssq[i],sclssq[i+1])
+ *          On exist returns sclssq[i]=sclssq[i]*sqrt(sclssq[i+1])
+ *          so that the result is stored in the first line.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval -k, the k-th argument had an illegal value
+ *
+ */
+int CORE_zplssq2( int N,
+                  double *sclssq )
+{
+    int i;
+    for (i=0; i<N; i+=2) {
+        sclssq[i] *= sqrt ( sclssq[i+1] );
+    }
+    return CHAMELEON_SUCCESS;
+}
diff --git a/coreblas/compute/core_zsyssq.c b/coreblas/compute/core_zsyssq.c
index c7abf5a71d25a37c353d0bcb57409c694d89c63a..dfad10e3e6d90b61e737c6a283452c791c37bea6 100644
--- a/coreblas/compute/core_zsyssq.c
+++ b/coreblas/compute/core_zsyssq.c
@@ -21,17 +21,155 @@
  */
 #include <math.h>
 #include "coreblas/lapacke.h"
+#include "coreblas/sumsq_update.h"
 #include "coreblas.h"
 
-#define UPDATE( __nb, __value )                                         \
-    if (__value != 0. ){                                                \
-        if ( *scale < __value ) {                                       \
-            *sumsq = __nb + (*sumsq) * ( *scale / __value ) * ( *scale / __value ); \
-            *scale = __value;                                           \
-        } else {                                                        \
-            *sumsq = *sumsq + __nb * ( __value / *scale ) *  ( __value / *scale ); \
-        }                                                               \
+/**
+ * @brief Subcase uplo == ChamUpper, storev == ChamColumnwise || ChamRowwise of CORE_zsyssq()
+ */
+static inline int
+CORE_zsyssq_up_col( int N,
+                    const CHAMELEON_Complex64_t *A, int LDA,
+                    double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * LDA );
+        for(i=0; i<j; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp );
+            sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp );
+            sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#endif
+        }
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+        ptr++;
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#endif
+    }
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ * @brief Subcase uplo == ChamLower, storev == ChamColumnwise || ChamRowwise of CORE_zsyssq()
+ */
+static inline int
+CORE_zsyssq_lo_col( int N,
+                    const CHAMELEON_Complex64_t *A, int LDA,
+                    double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * (LDA + 1) );
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+        ptr++;
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#endif
+        ptr++;
+        for(i=j+1; i<N; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp );
+            sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 1., sclssq+2*i, sclssq+2*i+1, &tmp );
+            sumsq_update( 1., sclssq+2*j, sclssq+2*j+1, &tmp );
+#endif
+        }
     }
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ * @brief Subcase uplo == ChamUpper, storev == ChamEltwise of CORE_zsyssq()
+ */
+static inline int
+CORE_zsyssq_up_elt( int N,
+                    const CHAMELEON_Complex64_t *A, int LDA,
+                    double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * LDA );
+
+        for(i=0; i<j; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 2., sclssq, sclssq+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 2., sclssq, sclssq+1, &tmp );
+#endif
+        }
+
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+        ptr++;
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#endif
+    }
+    return CHAMELEON_SUCCESS;
+}
+
+/**
+ * @brief Subcase uplo == ChamLower, storev == ChamEltwise of CORE_zsyssq()
+ */
+static inline int
+CORE_zsyssq_lo_elt( int N,
+                    const CHAMELEON_Complex64_t *A, int LDA,
+                    double *sclssq )
+{
+    int i, j;
+    double tmp;
+    double *ptr;
+
+    for(j=0; j<N; j++) {
+        ptr = (double*) ( A + j * (LDA + 1) );
+
+
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+        ptr++;
+        tmp = fabs(*ptr);
+        sumsq_update( 1., sclssq, sclssq+1, &tmp );
+#endif
+        ptr++;
+
+        for(i=j+1; i<N; i++, ptr++) {
+            tmp = fabs(*ptr);
+            sumsq_update( 2., sclssq, sclssq+1, &tmp );
+#if defined(PRECISION_z) || defined(PRECISION_c)
+            ptr++;
+            tmp = fabs(*ptr);
+            sumsq_update( 2., sclssq, sclssq+1, &tmp );
+#endif
+        }
+    }
+    return CHAMELEON_SUCCESS;
+}
 
 /**
  *
@@ -61,6 +199,12 @@
  *
  *******************************************************************************
  *
+ * @param[in] storev
+ *          Specifies whether the sums are made per column or row.
+ *          = ChamColumnwise: Computes the sum of squares on each column
+ *          = ChamRowwise:    Computes the sum of squares on each row
+ *          = ChamEltwise:    Computes the sum of squares on all the matrix
+ *
  *  @param[in] uplo
  *          Specifies whether the upper or lower triangular part of
  *          the symmetric matrix A is to be referenced as follows:
@@ -78,13 +222,16 @@
  *  @param[in] LDA
  *          The leading dimension of the tile A. LDA >= max(1,N).
  *
- *  @param[in,out] scale
- *          On entry, the value  scale  in the equation above.
- *          On exit, scale is overwritten with the value scl.
- *
- *  @param[in,out] sumsq
- *          On entry, the value  sumsq  in the equation above.
- *          On exit, SUMSQ is overwritten with the value ssq.
+ *  @param[in,out] sclssq
+ *          Dimension:  (2,K)
+ *          if storev == ChamColumnwise, K = N
+ *          if storev == ChamRowwise,    K = N
+ *          if storev == ChamEltwise,    K = 1
+ *          On entry, sclssq contains K couples (sclssq[2*i], sclssq[2*i+1])
+ *          which corresponds to (scale, sumsq) in the equation below
+ *          ( scl**2 )*ssq = sum( A( i, j )**2 ) + ( scale**2 )*sumsq,
+ *          respectively for the columns, the rows and the full matrix
+ *          On exit, each couple is overwritten with the final result (scl, ssq).
  *
  *******************************************************************************
  *
@@ -93,67 +240,21 @@
  *
  */
 
-int CORE_zsyssq(cham_uplo_t uplo, int N,
-                const CHAMELEON_Complex64_t *A, int LDA,
-                double *scale, double *sumsq)
+int CORE_zsyssq( cham_store_t storev, cham_uplo_t uplo, int N,
+                 const CHAMELEON_Complex64_t *A, int LDA,
+                 double *sclssq )
 {
-    int i, j;
-    double tmp;
-    double *ptr;
-
     if ( uplo == ChamUpper ) {
-        for(j=0; j<N; j++) {
-            ptr = (double*) ( A + j * LDA );
-
-            for(i=0; i<j; i++, ptr++) {
-
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-                ptr++;
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-#endif
-            }
-
-            /* Diagonal */
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-            ptr++;
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-#endif
+        if ( storev == ChamEltwise ) {
+            CORE_zsyssq_up_elt( N, A, LDA, sclssq );
+        } else {
+            CORE_zsyssq_up_col( N, A, LDA, sclssq );
         }
     } else {
-
-        for(j=0; j<N; j++) {
-            ptr = (double*) ( A + j * LDA + j);
-
-            /* Diagonal */
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-            ptr++;
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-            tmp = fabs(*ptr);
-            UPDATE( 1., tmp );
-            ptr++;
-#endif
-
-            for(i=j+1; i<N; i++, ptr++) {
-
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-
-#if defined(PRECISION_z) || defined(PRECISION_c)
-                ptr++;
-                tmp = fabs(*ptr);
-                UPDATE( 2., tmp );
-#endif
-            }
+        if ( storev == ChamEltwise ) {
+            CORE_zsyssq_lo_elt( N, A, LDA, sclssq );
+        } else {
+            CORE_zsyssq_lo_col( N, A, LDA, sclssq );
         }
     }
     return CHAMELEON_SUCCESS;
diff --git a/coreblas/eztrace_module/coreblas_eztrace_module b/coreblas/eztrace_module/coreblas_eztrace_module
index 3bca3cbf12c4763eec665bf20a02a23d38ee72ae..389858563abc4cfe8cca9e5d40109f6c2a461e3d 100644
--- a/coreblas/eztrace_module/coreblas_eztrace_module
+++ b/coreblas/eztrace_module/coreblas_eztrace_module
@@ -32,9 +32,9 @@ int  CORE_cgessm(int M, int N, int K, int IB,
                  const int *IPIV,
                  void *L, int LDL,
                  void *A, int LDA);
-int  CORE_cgessq(int M, int N,
+int  CORE_cgessq(cham_store_t storev, int M, int N,
                  void *A, int LDA,
-                 float *scale, float *sumsq);
+                 float *sclssq);
 int CORE_cgetf2_nopiv(int M, int N,
                   void *A, int LDA);
 int  CORE_cgetrf(int M, int N,
@@ -70,9 +70,9 @@ void CORE_cher2k(int uplo, int trans,
                  void *alpha, void *A, int LDA,
                                            void *B, int LDB,
                  float beta,                    void *C, int LDC);
-int  CORE_chessq(int uplo, int N,
+int  CORE_chessq(cham_store_t storev, int uplo, int N,
                  void *A, int LDA,
-                 float *scale, float *sumsq);
+                 float *sclssq);
 
 int  CORE_cherfb(int uplo, int N, int K, int IB, int NB,
                  void *A,    int LDA,
@@ -190,9 +190,9 @@ void CORE_csyr2k(int uplo, int trans,
                  void *alpha, void *A, int LDA,
                                            void *B, int LDB,
                  void *beta,        void *C, int LDC);
-int  CORE_csyssq(int uplo, int N,
+int  CORE_csyssq(cham_store_t storev, int uplo, int N,
                  void *A, int LDA,
-                 float *scale, float *sumsq);
+                 float *sclssq);
 int CORE_csytf2_nopiv(int uplo, int n, void *A, int lda);
 void CORE_ctrmm(int side, int uplo,
                 int transA, int diag,
@@ -334,9 +334,9 @@ int  CORE_dgessm(int M, int N, int K, int IB,
                  const int *IPIV,
                  const double *L, int LDL,
                  double *A, int LDA);
-int  CORE_dgessq(int M, int N,
+int  CORE_dgessq(cham_store_t storev, int M, int N,
                  const double *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int CORE_dgetf2_nopiv(int M, int N,
                   double *A, int LDA);
 int  CORE_dgetrf(int M, int N,
@@ -372,9 +372,9 @@ void CORE_dsyr2k(int uplo, int trans,
                  double alpha, const double *A, int LDA,
                                            const double *B, int LDB,
                  double beta,                    double *C, int LDC);
-int  CORE_dsyssq(int uplo, int N,
+int  CORE_dsyssq(cham_store_t storev, int uplo, int N,
                  const double *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 
 int  CORE_dsyrfb(int uplo, int N, int K, int IB, int NB,
                  const double *A,    int LDA,
@@ -621,9 +621,9 @@ int  CORE_sgessm(int M, int N, int K, int IB,
                  const int *IPIV,
                  const float *L, int LDL,
                  float *A, int LDA);
-int  CORE_sgessq(int M, int N,
+int  CORE_sgessq(cham_store_t storev, int M, int N,
                  const float *A, int LDA,
-                 float *scale, float *sumsq);
+                 float *sclssq);
 int CORE_sgetf2_nopiv(int M, int N,
                   float *A, int LDA);
 int  CORE_sgetrf(int M, int N,
@@ -659,9 +659,9 @@ void CORE_ssyr2k(int uplo, int trans,
                  float alpha, const float *A, int LDA,
                                            const float *B, int LDB,
                  float beta,                    float *C, int LDC);
-int  CORE_ssyssq(int uplo, int N,
+int  CORE_ssyssq(cham_store_t storev, int uplo, int N,
                  const float *A, int LDA,
-                 float *scale, float *sumsq);
+                 float *sclssq);
 
 int  CORE_ssyrfb(int uplo, int N, int K, int IB, int NB,
                  const float *A,    int LDA,
@@ -901,9 +901,9 @@ int  CORE_zgessm(int M, int N, int K, int IB,
                  const int *IPIV,
                  void *L, int LDL,
                  void *A, int LDA);
-int  CORE_zgessq(int M, int N,
+int  CORE_zgessq(cham_store_t storev, int M, int N,
                  void *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int CORE_zgetf2_nopiv(int M, int N,
                   void *A, int LDA);
 int  CORE_zgetrf(int M, int N,
@@ -939,9 +939,9 @@ void CORE_zher2k(int uplo, int trans,
                  void *alpha, void *A, int LDA,
                                            void *B, int LDB,
                  double beta,                    void *C, int LDC);
-int  CORE_zhessq(int uplo, int N,
+int  CORE_zhessq(cham_store_t storev, int uplo, int N,
                  void *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 
 int  CORE_zherfb(int uplo, int N, int K, int IB, int NB,
                  void *A,    int LDA,
@@ -1028,6 +1028,9 @@ void CORE_zplgsy(void *bump, int m, int n, void *A, int lda,
                  int bigM, int m0, int n0, unsigned long long int seed );
 void CORE_zplrnt(int m, int n, void *A, int lda,
                  int bigM, int m0, int n0, unsigned long long int seed );
+int CORE_zplssq( cham_store_t storev, int M, int N,
+                 double *sclssqin, double *sclssqout );
+int CORE_zplssq2( int N, double *sclssq );
 void CORE_zpotrf(int uplo, int N, void *A, int LDA, int *INFO);
 void CORE_zshift(int s, int m, int n, int L,
                  void *A);
@@ -1053,9 +1056,9 @@ void CORE_zsyr2k(int uplo, int trans,
                  void *alpha, void *A, int LDA,
                                            void *B, int LDB,
                  void *beta,        void *C, int LDC);
-int  CORE_zsyssq(int uplo, int N,
+int  CORE_zsyssq(cham_store_t storev, int uplo, int N,
                  void *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int CORE_zsytf2_nopiv(int uplo, int n, void *A, int lda);
 void CORE_ztrmm(int side, int uplo,
                 int transA, int diag,
diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h
index ee434de43d2da6c4219b00f8b8e4236a419adb86..2bd8c01067594c0c85ee2d43aeb6432d5b9ba031 100644
--- a/coreblas/include/coreblas/coreblas_z.h
+++ b/coreblas/include/coreblas/coreblas_z.h
@@ -64,9 +64,9 @@ int  CORE_zgessm(int M, int N, int K, int IB,
                  const int *IPIV,
                  const CHAMELEON_Complex64_t *L, int LDL,
                  CHAMELEON_Complex64_t *A, int LDA);
-int  CORE_zgessq(int M, int N,
+int  CORE_zgessq(cham_store_t storev, int M, int N,
                  const CHAMELEON_Complex64_t *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int CORE_zgetf2_nopiv(int M, int N,
                   CHAMELEON_Complex64_t *A, int LDA);
 int  CORE_zgetrf(int M, int N,
@@ -104,9 +104,9 @@ void CORE_zher2k(cham_uplo_t uplo, cham_trans_t trans,
                  CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
                                            const CHAMELEON_Complex64_t *B, int LDB,
                  double beta,                    CHAMELEON_Complex64_t *C, int LDC);
-int  CORE_zhessq(cham_uplo_t uplo, int N,
+int  CORE_zhessq(cham_store_t storev, cham_uplo_t uplo, int N,
                  const CHAMELEON_Complex64_t *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int  CORE_zherfb(cham_uplo_t uplo, int N, int K, int IB, int NB,
                  const CHAMELEON_Complex64_t *A,    int LDA,
                  const CHAMELEON_Complex64_t *T,    int LDT,
@@ -196,6 +196,9 @@ void CORE_zplgsy(CHAMELEON_Complex64_t bump, int m, int n, CHAMELEON_Complex64_t
                  int bigM, int m0, int n0, unsigned long long int seed );
 void CORE_zplrnt(int m, int n, CHAMELEON_Complex64_t *A, int lda,
                  int bigM, int m0, int n0, unsigned long long int seed );
+int CORE_zplssq( cham_store_t storev, int M, int N,
+                 double *sclssqin, double *sclssqout );
+int CORE_zplssq2( int N, double *sclssq );
 void CORE_zpotrf(cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA, int *INFO);
 void CORE_zshift(int s, int m, int n, int L,
                  CHAMELEON_Complex64_t *A);
@@ -221,9 +224,9 @@ void CORE_zsyr2k(cham_uplo_t uplo, cham_trans_t trans,
                  CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
                                            const CHAMELEON_Complex64_t *B, int LDB,
                  CHAMELEON_Complex64_t beta,        CHAMELEON_Complex64_t *C, int LDC);
-int  CORE_zsyssq(cham_uplo_t uplo, int N,
+int  CORE_zsyssq(cham_store_t storev, cham_uplo_t uplo, int N,
                  const CHAMELEON_Complex64_t *A, int LDA,
-                 double *scale, double *sumsq);
+                 double *sclssq);
 int CORE_zsytf2_nopiv(cham_uplo_t uplo, int n, CHAMELEON_Complex64_t *A, int lda);
 int CORE_ztradd(cham_uplo_t uplo, cham_trans_t trans, int M, int N,
                       CHAMELEON_Complex64_t alpha,
diff --git a/coreblas/include/coreblas/sumsq_update.h b/coreblas/include/coreblas/sumsq_update.h
new file mode 100644
index 0000000000000000000000000000000000000000..1c94e85ba13d540c5b72ad3277937829b9375c5b
--- /dev/null
+++ b/coreblas/include/coreblas/sumsq_update.h
@@ -0,0 +1,132 @@
+/**
+ *
+ * @file sumsq_update.h
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon CPU auxiliary sumsq_update routine
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @date 2019-03-27
+ *
+ */
+#ifndef _sumsq_update_h_
+#define _sumsq_update_h_
+
+/**
+ *******************************************************************************
+ *
+ * @brief Update the couple (scale, sumsq) with one element when
+ * computing the Froebnius norm.
+ *
+ * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to
+ * avoid overflow in the sum square computation.
+ *
+ *******************************************************************************
+ *
+ * @param[inout] scale
+ *           On entry, the former scale
+ *           On exit, the update scale to take into account the value
+ *
+ * @param[inout] sumsq
+ *           On entry, the former sumsq
+ *           On exit, the update sumsq to take into account the value
+ *
+ * @param[in] value
+ *          The value to integrate into the couple (scale, sumsq)
+ *
+ *******************************************************************************/
+static inline void
+#if defined(PRECISION_d) || defined(PRECISION_z)
+sumsq_update( int nb, double *scale, double *sumsq, const double *value )
+{
+    double absval = fabs(*value);
+    double ratio;
+    if ( absval != 0. ){
+        if ( (*scale) < absval ) {
+            ratio = (*scale) / absval;
+            *sumsq = (double)nb + (*sumsq) * ratio * ratio;
+            *scale = absval;
+        } else {
+            ratio = absval / (*scale);
+            *sumsq = (*sumsq) + (double)nb * ratio * ratio;
+        }
+    }
+}
+#elif defined(PRECISION_s) || defined(PRECISION_c)
+sumsq_update( int nb, float *scale, float *sumsq, const float *value )
+{
+    float absval = fabs(*value);
+    float ratio;
+    if ( absval != 0. ){
+        if ( (*scale) < absval ) {
+            ratio = (*scale) / absval;
+            *sumsq = (float)nb + (*sumsq) * ratio * ratio;
+            *scale = absval;
+        } else {
+            ratio = absval / (*scale);
+            *sumsq = (*sumsq) + (float)nb * ratio * ratio;
+        }
+    }
+}
+#endif
+
+/**
+ *******************************************************************************
+ *
+ * @brief Update the couple (scale, sumsq) by adding another couple when
+ * computing the Froebnius norm.
+ *
+ * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to
+ * avoid overflow in the sum square computation.
+ *
+ *******************************************************************************
+ *
+ * @param[inout] scale
+ *           On entry, the former scale
+ *           On exit, the update scale to take into account the value
+ *
+ * @param[inout] sumsq
+ *           On entry, the former sumsq
+ *           On exit, the update sumsq to take into account the value
+ *
+ * @param[in] value
+ *          The value to integrate into the couple (scale, sumsq)
+ *
+ *******************************************************************************/
+static inline void
+#if defined(PRECISION_d) || defined(PRECISION_z)
+sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout )
+{
+    if (*scaleout >= 0.) {
+        if ( (*scaleout) < (*scalein) ) {
+            *sumsqout = *sumsqin  + (*sumsqout * (( *scaleout / *scalein ) * ( *scaleout / *scalein )));
+            *scaleout = *scalein;
+        } else {
+            if ( (*scaleout) > 0. ){
+                *sumsqout = *sumsqout + (*sumsqin * (( *scalein / *scaleout ) * ( *scalein / *scaleout )));
+            }
+        }
+    }
+}
+#elif defined(PRECISION_s) || defined(PRECISION_c)
+sumsq_update_2( int nb, const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout )
+{
+    if (*scaleout >= 0.) {
+        if ( (*scaleout) < (*scalein) ) {
+            *sumsqout = *sumsqin  + (*sumsqout * (( *scaleout / *scalein ) * ( *scaleout / *scalein )));
+            *scaleout = *scalein;
+        } else {
+            if ( (*scaleout) > 0. ){
+                *sumsqout = *sumsqout + (*sumsqin * (( *scalein / *scaleout ) * ( *scalein / *scaleout )));
+            }
+        }
+    }
+}
+#endif
+
+#endif /* _sumsq_update_h_ */
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 7e623aebf155ab457a8fd15f85e55539553a6967..ec7ea257144d0f1fc0730feb2f5817b98cd956f0 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -151,6 +151,7 @@ typedef enum chameleon_dir_e {
 typedef enum chameleon_store_e {
     ChamColumnwise = 401, /**< Column wise storage  */
     ChamRowwise    = 402, /**< Row wise storage     */
+    ChamEltwise    = 403, /**< Element by element storage */
 } cham_store_t;
 
 
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index c82829c9e78db4c216de1d41eaa0d8793384f37f..2d8ec988208595f60f34b9355fa6f192ba89db6b 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -66,7 +66,7 @@ void INSERT_TASK_zgessm( const RUNTIME_option_t *options,
                          const CHAM_desc_t *D, int Dm, int Dn, int ldd,
                          const CHAM_desc_t *A, int Am, int An, int lda );
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                         int m, int n,
+                         cham_store_t storev, int m, int n,
                          const CHAM_desc_t *A, int Am, int An, int lda,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zgetrf( const RUNTIME_option_t *options,
@@ -112,7 +112,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                         double alpha, const CHAM_desc_t *A, int Am, int An, int lda,
                         double beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int n,
+                         cham_store_t storev, cham_uplo_t uplo, int n,
                          const CHAM_desc_t *A, int Am, int An, int lda,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zlacpy( const RUNTIME_option_t *options,
@@ -172,9 +172,10 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                          int m, int n, const CHAM_desc_t *A, int Am, int An, int lda,
                          int bigM, int m0, int n0, unsigned long long int seed );
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
+                         cham_store_t storev, int M, int N,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
                          const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn );
-void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
+void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn );
 void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n, int nb,
@@ -205,7 +206,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, int lda,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn, int ldc );
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int n,
+                         cham_store_t storev, cham_uplo_t uplo, int n,
                          const CHAM_desc_t *A, int Am, int An, int lda,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn );
 void INSERT_TASK_zsytrf_nopiv( const RUNTIME_option_t *options,
diff --git a/runtime/openmp/codelets/codelet_zgessq.c b/runtime/openmp/codelets/codelet_zgessq.c
index ae15172d441dcd6bbf8475b9c3d841a5acfc2be3..42453eaacfde2ddc798a70059df8d417c7bd1edb 100644
--- a/runtime/openmp/codelets/codelet_zgessq.c
+++ b/runtime/openmp/codelets/codelet_zgessq.c
@@ -25,12 +25,12 @@
 #include "coreblas/coreblas_z.h"
 
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                        int m, int n,
+                        cham_store_t storev, int m, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrScaleSum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn);
-#pragma omp task firstprivate(m, n, ptrA, lda, ptrScaleSum) depend(in:ptrA[0]) depend(inout:ptrScaleSum[0])
-    CORE_zgessq( m, n, ptrA, lda, &ptrScaleSum[0], &ptrScaleSum[1] );
+#pragma omp task firstprivate(storev, m, n, ptrA, lda, ptrScaleSum) depend(in:ptrA[0]) depend(inout:ptrScaleSum[0])
+    CORE_zgessq( storev, m, n, ptrA, lda, ptrScaleSum );
 }
diff --git a/runtime/openmp/codelets/codelet_zhessq.c b/runtime/openmp/codelets/codelet_zhessq.c
index 702095aed7388e453b0b9696b0291fc090392bd8..85232d001a1b29286796d1a2e17e77eb095f310c 100644
--- a/runtime/openmp/codelets/codelet_zhessq.c
+++ b/runtime/openmp/codelets/codelet_zhessq.c
@@ -22,15 +22,13 @@
  */
 #include "chameleon_openmp.h"
 #include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
 
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
+                         cham_store_t storev, cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
-    CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
-    double *ptrScaleSum = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn);
-#pragma omp task firstprivate(uplo, n, ptrA, lda, ptrScaleSum) depend(in:ptrScaleSum[0]) depend(inout:ptrA[0])
-    CORE_zhessq( uplo, n, ptrA, lda, &ptrScaleSum[0], &ptrScaleSum[1] );
+    INSERT_TASK_zsyssq( options, storev, uplo, n,
+                        A, Am, An, lda,
+                        SCALESUMSQ, SCALESUMSQm, SCALESUMSQn );
 }
diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c
index e41484c09702a4f9deddbf2645f904b03ad0a6e4..61196102c2d1ba7646925bb4ff27c2e858e08dd2 100644
--- a/runtime/openmp/codelets/codelet_zplssq.c
+++ b/runtime/openmp/codelets/codelet_zplssq.c
@@ -25,61 +25,22 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_z.h"
 
-/**
- *
- * @ingroup CORE_CHAMELEON_Complex64_t
- *
- * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq)
- *
- * with scl and ssq such that
- *
- *    ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) )
- *                      i
- *
- * The values of A(2*i+1) are assumed to be at least unity.
- * The values of A(2*i) are assumed to be non-negative and scl is
- *
- *    scl = max( A( 2*i ) ),
- *           i
- *
- * The routine makes only one pass through the matrix A.
- *
- *******************************************************************************
- *
- *  @param[in] M
- *          The number of couple (scale, sumsq) in the matrix A.
- *
- *  @param[in] A
- *          The 2-by-M matrix.
- *
- *  @param[out] result
- *          On exit, result contains scl * sqrt( ssq )
- *
- */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
+                         cham_store_t storev, int M, int N,
                          const CHAM_desc_t *IN,  int INm,  int INn,
                          const CHAM_desc_t *OUT, int OUTm, int OUTn )
 {
     double *sclssq_in  = RTBLKADDR(IN,  double, INm,  INn );
     double *sclssq_out = RTBLKADDR(OUT, double, OUTm, OUTn);
-#pragma omp task depend(in: sclssq_in[0]) depend(inout: sclssq_out[0])
-    {
-        if( sclssq_out[0] < sclssq_in[0] ) {
-            sclssq_out[1] = sclssq_in[1]  + (sclssq_out[1] * (( sclssq_out[0] / sclssq_in[0] ) * ( sclssq_out[0] / sclssq_in[0] )));
-            sclssq_out[0] = sclssq_in[0];
-        } else {
-            sclssq_out[1] = sclssq_out[1] + (sclssq_in[1]  * (( sclssq_in[0] / sclssq_out[0] ) * ( sclssq_in[0] / sclssq_out[0] )));
-        }
-    }
+#pragma omp task firstprivate(storev, M, N) depend(in: sclssq_in[0]) depend(inout: sclssq_out[0])
+    CORE_zplssq(storev, M, N, sclssq_in, sclssq_out);
 }
 
-void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
+void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
-    CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, CHAMELEON_Complex64_t, RESULTm, RESULTn);
+    CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn);
 
-#pragma omp task depend(inout: res[0])
-    {
-        res[0] = res[0] * sqrt( res[1] );
-    }
+#pragma omp task firstprivate(N) depend(inout: res[0])
+    CORE_zplssq2(N, res);
 }
diff --git a/runtime/openmp/codelets/codelet_zsyssq.c b/runtime/openmp/codelets/codelet_zsyssq.c
index bc616c81beccd9a2bc9e3711991fe1d2d7d587b6..32a7dc9d52775570d9575755b8d5c05d53945b70 100644
--- a/runtime/openmp/codelets/codelet_zsyssq.c
+++ b/runtime/openmp/codelets/codelet_zsyssq.c
@@ -23,12 +23,12 @@
 #include "chameleon/tasks_z.h"
 
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
+                        cham_store_t storev, cham_uplo_t uplo, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
     CHAMELEON_Complex64_t *ptrA = RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An);
     double *ptrSCALESUMSQ = RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn);
-#pragma omp task firstprivate(uplo, n, ptrA, lda, ptrSCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0])
-    CORE_zsyssq( uplo, n, ptrA, lda, &ptrSCALESUMSQ[0], &ptrSCALESUMSQ[1] );
+#pragma omp task firstprivate(storev, uplo, n, ptrA, lda, ptrSCALESUMSQ) depend(in:ptrA[0]) depend(inout:ptrSCALESUMSQ[0])
+    CORE_zsyssq( storev, uplo, n, ptrA, lda, ptrSCALESUMSQ );
 }
diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c
index eb1ffd6f66d86816206c819274f48265be2f6e88..93b332cbf9117375286077075141c3f4a7f14560 100644
--- a/runtime/parsec/codelets/codelet_zgessq.c
+++ b/runtime/parsec/codelets/codelet_zgessq.c
@@ -25,6 +25,7 @@ static inline int
 CORE_zgessq_parsec( parsec_execution_stream_t *context,
                     parsec_task_t             *this_task )
 {
+    cham_store_t storev;
     int m;
     int n;
     CHAMELEON_Complex64_t *A;
@@ -32,16 +33,16 @@ CORE_zgessq_parsec( parsec_execution_stream_t *context,
     double *SCALESUMSQ;
 
     parsec_dtd_unpack_args(
-        this_task, &m, &n, &A, &lda, &SCALESUMSQ );
+        this_task, &storev, &m, &n, &A, &lda, &SCALESUMSQ );
 
-    CORE_zgessq( m, n, A, lda, SCALESUMSQ, SCALESUMSQ+1 );
+    CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                        int m, int n,
+                        cham_store_t storev, int m, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
@@ -49,10 +50,11 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
 
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zgessq_parsec, options->priority, "gessq",
-        sizeof(int),    &m,            VALUE,
-        sizeof(int),    &n,            VALUE,
+        sizeof(cham_store_t), &storev,       VALUE,
+        sizeof(int),          &m,            VALUE,
+        sizeof(int),          &n,            VALUE,
         PASSED_BY_REF,   RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
-        sizeof(int),    &lda,          VALUE,
+        sizeof(int),          &lda,          VALUE,
         PASSED_BY_REF,   RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
diff --git a/runtime/parsec/codelets/codelet_zhessq.c b/runtime/parsec/codelets/codelet_zhessq.c
index 238e56bcbc80cb0a74b6d55b0049bb71d32a3072..2229c62eb91c03e52cbc848f482b2702a40a4c0c 100644
--- a/runtime/parsec/codelets/codelet_zhessq.c
+++ b/runtime/parsec/codelets/codelet_zhessq.c
@@ -19,40 +19,13 @@
  */
 #include "chameleon_parsec.h"
 #include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-static inline int
-CORE_zhessq_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
-{
-    cham_uplo_t uplo;
-    int n;
-    CHAMELEON_Complex64_t *A;
-    int lda;
-    double *SCALESUMSQ;
-
-    parsec_dtd_unpack_args(
-        this_task, &uplo, &n, &A, &lda, &SCALESUMSQ );
-
-    CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]);
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
-}
 
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
+                         cham_store_t storev, cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zhessq_parsec, options->priority, "hessq",
-        sizeof(int),           &uplo,               VALUE,
-        sizeof(int),           &n,                  VALUE,
-        PASSED_BY_REF,         RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
-        sizeof(int),           &lda,                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ),    INOUT | AFFINITY,
-        PARSEC_DTD_ARG_END );
+    INSERT_TASK_zsyssq( options, storev, uplo, n,
+                        A, Am, An, lda,
+                        SCALESUMSQ, SCALESUMSQm, SCALESUMSQn );
 }
diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c
index 6c797829395817c83819a5cca0ab53601d25e295..16f19a21fad3581ec4a170d7fae3a90b657d5350 100644
--- a/runtime/parsec/codelets/codelet_zplssq.c
+++ b/runtime/parsec/codelets/codelet_zplssq.c
@@ -26,58 +26,23 @@ static inline int
 CORE_zplssq_parsec( parsec_execution_stream_t *context,
                     parsec_task_t             *this_task )
 {
+    cham_store_t storev;
+    int M;
+    int N;
     double *SCLSSQ_IN;
     double *SCLSSQ_OUT;
 
     parsec_dtd_unpack_args(
-        this_task, &SCLSSQ_IN, &SCLSSQ_OUT );
+        this_task, &storev, &M, &N, &SCLSSQ_IN, &SCLSSQ_OUT );
 
-    assert( SCLSSQ_OUT[0] >= 0. );
-    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
-        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
-        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
-    } else {
-        if ( SCLSSQ_OUT[0] > 0 ) {
-            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
-        }
-    }
+    CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT);
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq)
- *
- * with scl and ssq such that
- *
- *    ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) )
- *                      i
- *
- * The values of A(2*i+1) are assumed to be at least unity.
- * The values of A(2*i) are assumed to be non-negative and scl is
- *
- *    scl = max( A( 2*i ) ),
- *           i
- *
- * The routine makes only one pass through the matrix A.
- *
- *******************************************************************************
- *
- *  @param[in] M
- *          The number of couple (scale, sumsq) in the matrix A.
- *
- *  @param[in] A
- *          The 2-by-M matrix.
- *
- *  @param[out] result
- *          On exit, result contains scl * sqrt( ssq )
- *
- */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
+                         cham_store_t storev, int M, int N,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
                          const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn )
 {
@@ -85,6 +50,9 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
 
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zplssq_parsec, options->priority, "plssq",
+        sizeof(int),           &storev,                           VALUE,
+        sizeof(int),           &M,                                VALUE,
+        sizeof(int),           &N,                                VALUE,
         PASSED_BY_REF,         RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ),    INPUT,
         PASSED_BY_REF,         RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ),                INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
@@ -94,24 +62,26 @@ static inline int
 CORE_zplssq2_parsec( parsec_execution_stream_t *context,
                      parsec_task_t             *this_task )
 {
+    int N;
     double *RESULT;
 
     parsec_dtd_unpack_args(
-        this_task, &RESULT );
+        this_task, &N, &RESULT );
 
-    RESULT[0] = RESULT[0] * sqrt( RESULT[1] );
+    CORE_zplssq2(N, RESULT);
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
+void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
 
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zplssq2_parsec, options->priority, "plssq2",
+        sizeof(int),           &N,                                VALUE,
         PASSED_BY_REF,         RTBLKADDR( RESULT, double, RESULTm, RESULTn ),     INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c
index 9f06c6d4c35769048c1817d92fafe694d5543f09..8e86035ccbaafd400985c1ffea92b9e27643e588 100644
--- a/runtime/parsec/codelets/codelet_zsyssq.c
+++ b/runtime/parsec/codelets/codelet_zsyssq.c
@@ -25,6 +25,7 @@ static inline int
 CORE_zsyssq_parsec( parsec_execution_stream_t *context,
                     parsec_task_t             *this_task )
 {
+    cham_store_t storev;
     cham_uplo_t uplo;
     int n;
     CHAMELEON_Complex64_t *A;
@@ -32,16 +33,16 @@ CORE_zsyssq_parsec( parsec_execution_stream_t *context,
     double *SCALESUMSQ;
 
     parsec_dtd_unpack_args(
-        this_task, &uplo, &n, &A, &lda, &SCALESUMSQ );
+        this_task, &storev, &uplo, &n, &A, &lda, &SCALESUMSQ );
 
-    CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]);
+    CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
+                        cham_store_t storev, cham_uplo_t uplo, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
@@ -49,7 +50,8 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
 
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zsyssq_parsec, options->priority, "syssq",
-        sizeof(int),     &uplo,                  VALUE,
+        sizeof(cham_store_t),   &storev,                VALUE,
+        sizeof(int),            &uplo,                  VALUE,
         sizeof(int),            &n,                     VALUE,
         PASSED_BY_REF,          RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
         sizeof(int),            &lda,                   VALUE,
diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c
index f63b6d99a6fc597a6b97549d24860a839f6364eb..bf5396e4206dbd5f9389377ac11a16c45b4e24d7 100644
--- a/runtime/quark/codelets/codelet_zgessq.c
+++ b/runtime/quark/codelets/codelet_zgessq.c
@@ -25,27 +25,29 @@
 
 void CORE_zgessq_quark(Quark *quark)
 {
+    cham_store_t storev;
     int m;
     int n;
     CHAMELEON_Complex64_t *A;
     int lda;
     double *SCALESUMSQ;
 
-    quark_unpack_args_5( quark, m, n, A, lda, SCALESUMSQ );
-    CORE_zgessq( m, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]);
+    quark_unpack_args_6( quark, storev, m, n, A, lda, SCALESUMSQ );
+    CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ );
 }
 
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                        int m, int n,
+                        cham_store_t storev, int m, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt,
-                      sizeof(int),                     &m,    VALUE,
-                      sizeof(int),                     &n,    VALUE,
+                      sizeof(cham_store_t),            &storev, VALUE,
+                      sizeof(int),                     &m,      VALUE,
+                      sizeof(int),                     &n,      VALUE,
                       sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
-                      sizeof(int),                     &lda,  VALUE,
+                      sizeof(int),                     &lda,    VALUE,
                       sizeof(double)*2,                RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
                       0);
 }
diff --git a/runtime/quark/codelets/codelet_zhessq.c b/runtime/quark/codelets/codelet_zhessq.c
index 5d797c984297b70b6bbbe570825ba76e069859f9..b66dfd60a9dcda3747993cacec4b4d33b22e6370 100644
--- a/runtime/quark/codelets/codelet_zhessq.c
+++ b/runtime/quark/codelets/codelet_zhessq.c
@@ -21,31 +21,13 @@
  */
 #include "chameleon_quark.h"
 #include "chameleon/tasks_z.h"
-#include "coreblas/coreblas_z.h"
-
-void CORE_zhessq_quark(Quark *quark)
-{
-    cham_uplo_t uplo;
-    int n;
-    CHAMELEON_Complex64_t *A;
-    int lda;
-    double *SCALESUMSQ;
-
-    quark_unpack_args_5( quark, uplo, n, A, lda, SCALESUMSQ );
-    CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]);
-}
 
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
+                         cham_store_t storev, cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-    QUARK_Insert_Task(opt->quark, CORE_zhessq_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &uplo, VALUE,
-        sizeof(int),                     &n,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
-        sizeof(int),                     &lda,  VALUE,
-        sizeof(double)*2,                RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
-        0);
+    INSERT_TASK_zsyssq( options, storev, uplo, n,
+                        A, Am, An, lda,
+                        SCALESUMSQ, SCALESUMSQm, SCALESUMSQn );
 }
diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c
index 8744b7b3ec319ce347abd8f40d101cb1a92b3e07..70810682c9a89cfba122782dbdff65149a17a09e 100644
--- a/runtime/quark/codelets/codelet_zplssq.c
+++ b/runtime/quark/codelets/codelet_zplssq.c
@@ -26,20 +26,15 @@
 
 void CORE_zplssq_quark(Quark *quark)
 {
+    cham_store_t storev;
+    int M;
+    int N;
     double *SCLSSQ_IN;
     double *SCLSSQ_OUT;
 
-    quark_unpack_args_2( quark, SCLSSQ_IN, SCLSSQ_OUT );
+    quark_unpack_args_5( quark, storev, M, N, SCLSSQ_IN, SCLSSQ_OUT );
 
-    assert( SCLSSQ_OUT[0] >= 0. );
-    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
-        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
-        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
-    } else {
-        if ( SCLSSQ_OUT[0] > 0 ) {
-            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
-        }
-    }
+    CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT);
 }
 
 /**
@@ -74,30 +69,48 @@ void CORE_zplssq_quark(Quark *quark)
  *
  */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
+                         cham_store_t storev, int M, int N,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
                          const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn )
 {
+    int sizein = M*N;
+    int sizeout;
+
+    if ( storev == ChamColumnwise ) {
+        sizeout = 2*N;
+    } else if ( storev == ChamRowwise ) {
+        sizeout = 2*M;
+    } else {
+        sizeout = 2;
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt,
-        sizeof(double)*2, RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INPUT,
-        sizeof(double)*2, RTBLKADDR(SCLSSQ,     double, SCLSSQm,     SCLSSQn),     INOUT,
+        sizeof(int),            &storev,    VALUE,
+        sizeof(int),            &M,         VALUE,
+        sizeof(int),            &N,         VALUE,
+        sizeof(double)*sizein,  RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INPUT,
+        sizeof(double)*sizeout, RTBLKADDR(SCLSSQ,     double, SCLSSQm,     SCLSSQn),     INOUT,
         0);
 }
 
 
 void CORE_zplssq2_quark(Quark *quark)
 {
+    int N;
     double *RESULT;
 
-    quark_unpack_args_1( quark, RESULT );
-    RESULT[0] = RESULT[0] * sqrt( RESULT[1] );
+    quark_unpack_args_2( quark, N, RESULT );
+
+    CORE_zplssq2(N, RESULT);
 }
 
-void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
+void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt,
-        sizeof(double)*2, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT,
+        sizeof(int),        &N,         VALUE,
+        sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c
index 9611e2082f9fd7c971f196d3318c66edb4353a31..cf1f39442c015139d869ae1fc08a5e4a79bd83a9 100644
--- a/runtime/quark/codelets/codelet_zsyssq.c
+++ b/runtime/quark/codelets/codelet_zsyssq.c
@@ -25,24 +25,26 @@
 
 void CORE_zsyssq_quark(Quark *quark)
 {
+    cham_store_t storev;
     cham_uplo_t uplo;
     int n;
     CHAMELEON_Complex64_t *A;
     int lda;
     double *SCALESUMSQ;
 
-    quark_unpack_args_5( quark, uplo, n, A, lda, SCALESUMSQ );
-    CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1]);
+    quark_unpack_args_6( quark, storev, uplo, n, A, lda, SCALESUMSQ );
+    CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ );
 }
 
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
+                        cham_store_t storev, cham_uplo_t uplo, int n,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
     QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt,
-        sizeof(int),              &uplo, VALUE,
+        sizeof(cham_store_t),            &storev, VALUE,
+        sizeof(int),                     &uplo, VALUE,
         sizeof(int),                     &n,    VALUE,
         sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
         sizeof(int),                     &lda,  VALUE,
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 35aea3122a2aa896ef9044c3a37c13191235296f..dddcb6994da86b889f7d484828d86a9d71cc9814 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -54,8 +54,8 @@ CHAMELEON_CL_CB(zsytrf_nopiv,  starpu_matrix_get_nx(task->handles[0]), 0, 0,
 CHAMELEON_CL_CB(zplgsy,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                                M*N)
 CHAMELEON_CL_CB(zplrnt,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                                M*N)
 CHAMELEON_CL_CB(zbuild,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                                M*N)
-CHAMELEON_CL_CB(zplssq,                                             1,                                      1, 0,                                                4)
-CHAMELEON_CL_CB(zplssq2,                                            1,                                      1, 0,                                                1)
+CHAMELEON_CL_CB(zplssq,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                                M*N)
+CHAMELEON_CL_CB(zplssq2,       starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                                2*N)
 CHAMELEON_CL_CB(zpotrf,        starpu_matrix_get_nx(task->handles[0]), 0, 0,                                                                           (1./3.)*M* M*M)
 CHAMELEON_CL_CB(zssssm,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), M*M*(2.*M+starpu_matrix_get_nx(task->handles[2])))
 CHAMELEON_CL_CB(zsymm,         starpu_matrix_get_nx(task->handles[2]), starpu_matrix_get_ny(task->handles[2]), 0,                                           2.*M*M *N)
diff --git a/runtime/starpu/codelets/codelet_zgessq.c b/runtime/starpu/codelets/codelet_zgessq.c
index 3c37cb1c6858ab4af43475fd19e17f199d2cea65..c9141aefbd8aaa67be87429b7e0f29a34e2344eb 100644
--- a/runtime/starpu/codelets/codelet_zgessq.c
+++ b/runtime/starpu/codelets/codelet_zgessq.c
@@ -25,6 +25,7 @@
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgessq_cpu_func(void *descr[], void *cl_arg)
 {
+    cham_store_t storev;
     int m;
     int n;
     CHAMELEON_Complex64_t *A;
@@ -33,8 +34,8 @@ static void cl_zgessq_cpu_func(void *descr[], void *cl_arg)
 
     A          = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
     SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &lda);
-    CORE_zgessq( m, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] );
+    starpu_codelet_unpack_args(cl_arg, &storev, &m, &n, &lda);
+    CORE_zgessq( storev, m, n, A, lda, SCALESUMSQ );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -44,7 +45,7 @@ static void cl_zgessq_cpu_func(void *descr[], void *cl_arg)
 CODELETS_CPU(zgessq, 2, cl_zgessq_cpu_func)
 
 void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
-                         int m, int n,
+                         cham_store_t storev, int m, int n,
                          const CHAM_desc_t *A, int Am, int An, int lda,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
@@ -58,6 +59,7 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
 
     starpu_insert_task(
         starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &storev,                     sizeof(cham_store_t),
         STARPU_VALUE,    &m,                          sizeof(int),
         STARPU_VALUE,    &n,                          sizeof(int),
         STARPU_R,        RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
diff --git a/runtime/starpu/codelets/codelet_zhessq.c b/runtime/starpu/codelets/codelet_zhessq.c
index 724f5e6fba78f76a62541b8e1a0e40f207716fe8..c1b18c8bc388df6890ff44a1e74058036987451c 100644
--- a/runtime/starpu/codelets/codelet_zhessq.c
+++ b/runtime/starpu/codelets/codelet_zhessq.c
@@ -22,51 +22,12 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zhessq_cpu_func(void *descr[], void *cl_arg)
-{
-    cham_uplo_t uplo;
-    int n;
-    CHAMELEON_Complex64_t *A;
-    int lda;
-    double *SCALESUMSQ;
-
-    A          = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
-    SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
-    starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda);
-    CORE_zhessq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] );
-}
-#endif /* !defined(CHAMELEON_SIMULATION) */
-
-/*
- * Codelet definition
- */
-CODELETS_CPU(zhessq, 2, cl_zhessq_cpu_func)
-
 void INSERT_TASK_zhessq( const RUNTIME_option_t *options,
-                        cham_uplo_t uplo, int n,
-                        const CHAM_desc_t *A, int Am, int An, int lda,
-                        const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
+                         cham_store_t storev, cham_uplo_t uplo, int n,
+                         const CHAM_desc_t *A, int Am, int An, int lda,
+                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
-    struct starpu_codelet *codelet = &cl_zhessq;
-    void (*callback)(void*) = options->profiling ? cl_zgessq_callback : NULL;
-
-    CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_R(A, Am, An);
-    CHAMELEON_ACCESS_RW(SCALESUMSQ, SCALESUMSQm, SCALESUMSQn);
-    CHAMELEON_END_ACCESS_DECLARATION;
-
-    starpu_insert_task(
-        starpu_mpi_codelet(codelet),
-        STARPU_VALUE,    &uplo,                       sizeof(int),
-        STARPU_VALUE,    &n,                          sizeof(int),
-        STARPU_R,        RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_VALUE,    &lda,                        sizeof(int),
-        STARPU_RW,       RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn),
-        STARPU_PRIORITY, options->priority,
-        STARPU_CALLBACK, callback,
-#if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zhessq",
-#endif
-        0);
+    INSERT_TASK_zsyssq( options, storev, uplo, n,
+                        A, Am, An, lda,
+                        SCALESUMSQ, SCALESUMSQm, SCALESUMSQn );
 }
diff --git a/runtime/starpu/codelets/codelet_zplssq.c b/runtime/starpu/codelets/codelet_zplssq.c
index 683aedf2f070bccb037af1139abdd9d3c24ab15c..96a41eb75bf9e30d9ad939acd8c29f89c71cc7c0 100644
--- a/runtime/starpu/codelets/codelet_zplssq.c
+++ b/runtime/starpu/codelets/codelet_zplssq.c
@@ -26,21 +26,17 @@
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zplssq_cpu_func(void *descr[], void *cl_arg)
 {
+    cham_store_t storev;
+    int M;
+    int N;
     double *SCLSSQ_IN;
     double *SCLSSQ_OUT;
 
+    starpu_codelet_unpack_args(cl_arg, &storev, &M, &N);
     SCLSSQ_IN  = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
     SCLSSQ_OUT = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
 
-    assert( SCLSSQ_OUT[0] >= 0. );
-    if( SCLSSQ_OUT[0] < SCLSSQ_IN[0] ) {
-        SCLSSQ_OUT[1] = SCLSSQ_IN[1]  + (SCLSSQ_OUT[1] * (( SCLSSQ_OUT[0] / SCLSSQ_IN[0] ) * ( SCLSSQ_OUT[0] / SCLSSQ_IN[0] )));
-        SCLSSQ_OUT[0] = SCLSSQ_IN[0];
-    } else {
-        if ( SCLSSQ_OUT[0] > 0 ) {
-            SCLSSQ_OUT[1] = SCLSSQ_OUT[1] + (SCLSSQ_IN[1]  * (( SCLSSQ_IN[0] / SCLSSQ_OUT[0] ) * ( SCLSSQ_IN[0] / SCLSSQ_OUT[0] )));
-        }
-    }
+    CORE_zplssq(storev, M, N, SCLSSQ_IN, SCLSSQ_OUT);
 
     (void)cl_arg;
 }
@@ -51,40 +47,10 @@ static void cl_zplssq_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zplssq, 2, cl_zplssq_cpu_func)
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- * @brief Compute sum( a_ij ^ 2 ) = scl * sqrt(ssq)
- *
- * with scl and ssq such that
- *
- *    ( scl**2 )*ssq = sum( A( 2*i )**2 * A( 2*i+1 ) )
- *                      i
- *
- * The values of A(2*i+1) are assumed to be at least unity.
- * The values of A(2*i) are assumed to be non-negative and scl is
- *
- *    scl = max( A( 2*i ) ),
- *           i
- *
- * The routine makes only one pass through the matrix A.
- *
- *******************************************************************************
- *
- *  @param[in] M
- *          The number of couple (scale, sumsq) in the matrix A.
- *
- *  @param[in] A
- *          The 2-by-M matrix.
- *
- *  @param[out] result
- *          On exit, result contains scl * sqrt( ssq )
- *
- */
 void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
-                        const CHAM_desc_t *SCLSSQ_IN,  int SCLSSQ_INm,  int SCLSSQ_INn,
-                        const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn )
+                         cham_store_t storev, int M, int N,
+                         const CHAM_desc_t *SCLSSQ_IN,  int SCLSSQ_INm,  int SCLSSQ_INn,
+                         const CHAM_desc_t *SCLSSQ_OUT, int SCLSSQ_OUTm, int SCLSSQ_OUTn )
 {
     struct starpu_codelet *codelet = &cl_zplssq;
     void (*callback)(void*) = options->profiling ? cl_zplssq_callback : NULL;
@@ -96,6 +62,9 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
 
     starpu_insert_task(
         starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &storev,            sizeof(int),
+        STARPU_VALUE,    &M,                 sizeof(int),
+        STARPU_VALUE,    &N,                 sizeof(int),
         STARPU_R,  RTBLKADDR( SCLSSQ_IN,  double, SCLSSQ_INm,  SCLSSQ_INn  ),
         STARPU_RW, RTBLKADDR( SCLSSQ_OUT, double, SCLSSQ_OUTm, SCLSSQ_OUTn ),
         STARPU_PRIORITY,    options->priority,
@@ -109,11 +78,13 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zplssq2_cpu_func(void *descr[], void *cl_arg)
 {
+    int N;
     double *RESULT;
 
+    starpu_codelet_unpack_args(cl_arg, &N);
     RESULT = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
 
-    RESULT[0] = RESULT[0] * sqrt( RESULT[1] );
+    CORE_zplssq2(N, RESULT);
 
     (void)cl_arg;
 }
@@ -124,7 +95,7 @@ static void cl_zplssq2_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zplssq2, 1, cl_zplssq2_cpu_func)
 
-void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
+void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
     struct starpu_codelet *codelet = &cl_zplssq2;
@@ -136,6 +107,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options,
 
     starpu_insert_task(
         starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &N,                 sizeof(int),
         STARPU_RW, RTBLKADDR(RESULT, double, RESULTm, RESULTn),
         STARPU_PRIORITY,    options->priority,
         STARPU_CALLBACK,    callback,
diff --git a/runtime/starpu/codelets/codelet_zsyssq.c b/runtime/starpu/codelets/codelet_zsyssq.c
index 543b6e1b6b5ef298c47f9dee64c1481596df35a5..30980d669565b05567026c20649459be00ac8549 100644
--- a/runtime/starpu/codelets/codelet_zsyssq.c
+++ b/runtime/starpu/codelets/codelet_zsyssq.c
@@ -25,6 +25,7 @@
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg)
 {
+    cham_store_t storev;
     cham_uplo_t uplo;
     int n;
     CHAMELEON_Complex64_t *A;
@@ -33,8 +34,8 @@ static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg)
 
     A          = (CHAMELEON_Complex64_t *)STARPU_MATRIX_GET_PTR(descr[0]);
     SCALESUMSQ = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
-    starpu_codelet_unpack_args(cl_arg, &uplo, &n, &lda);
-    CORE_zsyssq( uplo, n, A, lda, &SCALESUMSQ[0], &SCALESUMSQ[1] );
+    starpu_codelet_unpack_args(cl_arg, &storev, &uplo, &n, &lda);
+    CORE_zsyssq( storev, uplo, n, A, lda, SCALESUMSQ );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -44,7 +45,7 @@ static void cl_zsyssq_cpu_func(void *descr[], void *cl_arg)
 CODELETS_CPU(zsyssq, 2, cl_zsyssq_cpu_func)
 
 void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
-                         cham_uplo_t uplo, int n,
+                         cham_store_t storev, cham_uplo_t uplo, int n,
                          const CHAM_desc_t *A, int Am, int An, int lda,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
@@ -58,6 +59,7 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
 
     starpu_insert_task(
         starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &storev,                     sizeof(cham_store_t),
         STARPU_VALUE,    &uplo,                       sizeof(int),
         STARPU_VALUE,    &n,                          sizeof(int),
         STARPU_R,        RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),