diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index 4608e59b8a50b4ed692ab9e62e7d17ac47b4a9ed..51fcb8ec288d1acc4162d3ca5a8d77ae25f6bb56 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -14,6 +14,7 @@ _extra_blas = [
     ('',                     'she2ge',               'dhe2ge',               'che2ge',               'zhe2ge'              ),
     ('',                     'slatro',               'dlatro',               'clatro',               'zlatro'              ), #=> Replace by getmo/gecmo as in essl
     ('',                     'sbuild',               'dbuild',               'cbuild',               'zbuild'              ), #=> Replace by map function
+    ('',                     'sgram',                'dgram',                'cgram',                'zgram'               ),
 ]
 
 _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ]
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 96493134cf99cb919e819f4c7543e4dff94b2370..3ca2aaaa1adc53980cc7d8496719df735decbc52 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -242,6 +242,11 @@ set(ZSRC
     ##################
     zbuild.c
     pzbuild.c
+    ##################
+    # Gram
+    ##################
+    zgram.c
+    pzgram.c
 )
 
 precisions_rules_py(CHAMELEON_SRCS_GENERATED "${ZSRC}"
diff --git a/compute/pzgram.c b/compute/pzgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..c8fc0b34e4e38043bfc8468c747fa0f2b1fd5a0f
--- /dev/null
+++ b/compute/pzgram.c
@@ -0,0 +1,209 @@
+/**
+ *
+ * @file pzgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram parallel algorithm
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-10
+ * @precisions normal z -> s d c z
+ *
+ */
+#include "control/common.h"
+
+#define A( m, n )        A,     (m), (n)
+#define W( desc, m, n ) (desc), (m), (n)
+
+static inline void
+chameleon_pzgram_internal( cham_uplo_t uplo,
+                           CHAM_desc_t *A,
+                           CHAM_desc_t *Wcol,
+                           CHAM_desc_t *Welt,
+                           RUNTIME_option_t *options )
+{
+    int m, n;
+    int MT = A->mt;
+    int NT = A->nt;
+    int M  = A->m;
+    int N  = A->n;
+    int P  = Welt->p;
+    int Q  = Welt->q;
+
+    /**
+     *  1) compute (scl,ssq) over columns in each tile
+     */
+    for(n = 0; n < NT; n++) {
+        int mmin = ( uplo == ChamLower ) ? n                      : 0;
+        int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT;
+        int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
+
+        for(m = mmin; m < mmax; m++) {
+            int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
+            int ldam = BLKLDD( A, m );
+
+            if ( n == m ) {
+                INSERT_TASK_dsyssq(
+                    options, ChamColumnwise, uplo, tempmm,
+                    A(m, n), ldam, W( Wcol, m, n) );
+            }
+            else {
+                INSERT_TASK_dgessq(
+                    options, ChamColumnwise, tempmm, tempnn,
+                    A(m, n), ldam, W( Wcol, m, n) );
+                if ( uplo != ChamUpperLower ) {
+                    INSERT_TASK_dgessq(
+                        options, ChamRowwise, tempmm, tempnn,
+                        A(m, n), ldam, W( Wcol, n, m) );
+                }
+            }
+        }
+    }
+
+    for(n = 0; n < NT; n++) {
+        int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
+
+        /**
+         *  2) reduce columns (scl,ssq) tiles per processus (between lines)
+         */
+        for(m = P; m < MT; m++) {
+            INSERT_TASK_dplssq(
+                options, ChamColumnwise, 1, tempnn,
+                W( Wcol, m,   n ),
+                W( Wcol, m%P, n ) );
+        }
+
+        /**
+         *  3) reduce columns (scl,ssq) tiles on the first line of tiles
+         */
+        for(m = 1; m < P; m++) {
+            INSERT_TASK_dplssq(
+                options, ChamColumnwise, 1, tempnn,
+                W( Wcol, m, n ),
+                W( Wcol, 0, n ) );
+        }
+
+        /* 4) reduce (scl,ssq) inside each tile of the first line of tiles for the global sum square */
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, tempnn,
+            W( Wcol, 0, n ),
+            W( Welt, 0, n ) );
+
+        /* 5) deduce the sum square for each column from the pairs (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */
+        INSERT_TASK_dplssq2( options, tempnn, W( Wcol, 0, n ) );
+    }
+
+    /* 6) reduce global sum squares on each processus (between columns) */
+    for(n = Q; n < NT; n++) {
+        INSERT_TASK_dplssq( options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, n%Q) );
+    }
+
+    /* 7) reduce global sum squares on the first tile (index 0, 0) */
+    for(n = 1; n < Q; n++) {
+        INSERT_TASK_dplssq(
+            options, ChamEltwise, 1, 1, W( Welt, 0, n), W( Welt, 0, 0) );
+    }
+
+    /* 8) deduce the global sum square from the pair (scl,ssq) -> sqrt(sum) = scl*sqrt(ssq) */
+    INSERT_TASK_dplssq2( options, 1, W( Welt, 0, 0) );
+
+    /* Finally compute Gram matrix coefficients inplace */
+    for(n = 0; n < NT; n++) {
+        int mmin = ( uplo == ChamLower ) ? n                      : 0;
+        int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, MT) : MT;
+        int tempnn = ( n == (NT-1) ) ? N - n * A->nb : A->nb;
+
+        for(m = mmin; m < mmax; m++) {
+            int tempmm = ( m == (MT-1) ) ? M - m * A->mb : A->mb;
+            int ldam = BLKLDD( A, m );
+
+            INSERT_TASK_zgram(
+                options,
+                ( m == n ) ? uplo : ChamUpperLower,
+                A->m, A->n, tempmm, tempnn,
+                W( Wcol, 0, m ), 2,
+                W( Wcol, 0, n ), 2,
+                W( Welt, 0, 0 ),
+                A( m, n ), ldam );
+        }
+    }
+}
+
+/**
+ *
+ */
+void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+    CHAM_desc_t Wcol;
+    CHAM_desc_t Welt;
+
+    int workmt, worknt;
+    int m, n, tempmm, tempnn, ldw;
+
+    chamctxt = chameleon_context_self();
+    if ( sequence->status != CHAMELEON_SUCCESS ) {
+        return;
+    }
+    RUNTIME_options_init(&options, chamctxt, sequence, request);
+
+    workmt = chameleon_max( A->mt, A->p );
+    worknt = chameleon_max( A->nt, A->q );
+
+    RUNTIME_options_ws_alloc( &options, 1, 0 );
+
+    chameleon_desc_init( &Wcol, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, A->nb, 2*A->nb,
+                         2*workmt, A->n, 0, 0, 2*workmt, A->n, A->p, A->q,
+                         NULL, NULL, NULL );
+
+    chameleon_desc_init( &Welt, CHAMELEON_MAT_ALLOC_TILE, ChamRealDouble, 2, 1, 2,
+                         2, worknt, 0, 0, 2, worknt, A->p, A->q,
+                         NULL, NULL, NULL );
+
+    /* Initialize Wcol */
+    for(m = 0; m < Wcol.mt; m++) {
+        tempmm = m == Wcol.mt-1 ? Wcol.m-m*Wcol.mb : Wcol.mb;
+        ldw = Wcol.get_blkldd(&Wcol, m);
+        for(n = 0; n < Wcol.nt; n++) {
+            tempnn = n == Wcol.nt-1 ? Wcol.n-n*Wcol.nb : Wcol.nb;
+            INSERT_TASK_dlaset(
+                &options,
+                ChamUpperLower, tempmm, tempnn,
+                -1., -1.,
+                W( &Wcol, m, n ), ldw );
+        }
+    }
+    /* Initialize Welt */
+    for(m = 0; m < Welt.mt; m++) {
+        tempmm = m == Welt.mt-1 ? Welt.m-m*Welt.mb : Welt.mb;
+        ldw = Welt.get_blkldd(&Welt, m);
+        for(n = 0; n < Welt.nt; n++) {
+            tempnn = n == Welt.nt-1 ? Welt.n-n*Welt.nb : Welt.nb;
+            INSERT_TASK_dlaset(
+                &options,
+                ChamUpperLower, tempmm, tempnn,
+                -1., -1.,
+                W( &Welt, m, n ), ldw );
+        }
+    }
+
+    chameleon_pzgram_internal( uplo, A, &Wcol, &Welt, &options );
+
+    CHAMELEON_Desc_Flush( &Wcol, sequence );
+    CHAMELEON_Desc_Flush( &Welt, sequence );
+    CHAMELEON_Desc_Flush( A, sequence );
+    RUNTIME_sequence_wait( chamctxt, sequence );
+
+    chameleon_desc_destroy( &Wcol );
+    chameleon_desc_destroy( &Welt );
+
+    RUNTIME_options_ws_free(&options);
+    RUNTIME_options_finalize(&options, chamctxt);
+}
diff --git a/compute/zgram.c b/compute/zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..602856a52cc43e427a377cef676135db1e9959b1
--- /dev/null
+++ b/compute/zgram.c
@@ -0,0 +1,236 @@
+/**
+ *
+ * @file zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram wrappers
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-10
+ * @precisions normal z -> s d c z
+ *
+ */
+#include "control/common.h"
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ *  CHAMELEON_zgram replace a general matrix by the Gram matrix inplace
+ *
+ *    \f[
+ *    d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\
+ *    A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2)
+ *    \f]
+ *
+ *******************************************************************************
+ *
+ * @param[in] N
+ *          The number of lines and columns of the matrix A. N >= 0. When N = 0,
+ *          the returned value is set to zero.
+ *
+ * @param[in] A
+ *          The N-by-N matrix A.
+ *
+ * @param[in] LDA
+ *          The leading dimension of the array A. LDA >= max(1,N).
+ *
+ *******************************************************************************
+ *
+* @retval CHAMELEON_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgram_Tile
+ * @sa CHAMELEON_zgram_Tile_Async
+ * @sa CHAMELEON_sgram
+ *
+ */
+int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA )
+{
+    int NB;
+    int status;
+    CHAM_context_t *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    CHAM_desc_t descAl, descAt;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zgram", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    /* Check input arguments */
+    if (N < 0) {
+        chameleon_error("CHAMELEON_zgram", "illegal value of N");
+        return -1;
+    }
+    if (LDA < chameleon_max(1, N)) {
+        chameleon_error("CHAMELEON_zgram", "illegal value of LDA");
+        return -3;
+    }
+
+    /* Quick return */
+    if (N == 0)
+        return CHAMELEON_SUCCESS;
+
+    /* Tune NB depending on M, N & NRHS; Set NBNB */
+    status = chameleon_tune(CHAMELEON_FUNC_DGEMM, N, N, 0);
+    if (status != CHAMELEON_SUCCESS) {
+        chameleon_error("CHAMELEON_zgram", "chameleon_tune() failed");
+        return status;
+    }
+
+    /* Set NT */
+    NB = CHAMELEON_NB;
+
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    /* Submit the matrix conversion */
+    chameleon_zlap2tile( chamctxt, &descAl, &descAt, ChamDescInout, uplo,
+                         A, NB, NB, LDA, N, N, N, sequence, &request );
+
+    /* Call the tile interface */
+    CHAMELEON_zgram_Tile_Async( uplo, &descAt, sequence, &request );
+
+    /* Submit the matrix conversion back */
+    chameleon_ztile2lap( chamctxt, &descAl, &descAt,
+                         ChamDescInout, uplo, sequence, &request );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+
+    /* Cleanup the temporary data */
+    chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
+
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile
+ *
+ *  CHAMELEON_zgram_Tile - Tile equivalent of CHAMELEON_zgram().
+ *  Operates on matrices stored by tiles.
+ *  All matrices are passed through descriptors.
+ *  All dimensions are taken from the descriptors.
+ *
+ *******************************************************************************
+ *
+ * @param[in] A
+ *          The N-by-N matrix A.
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgram
+ * @sa CHAMELEON_zgram_Tile_Async
+ * @sa CHAMELEON_sgram_Tile
+ *
+ */
+int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_sequence_t *sequence = NULL;
+    RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    int status;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    chameleon_sequence_create( chamctxt, &sequence );
+
+    CHAMELEON_zgram_Tile_Async( uplo, A, sequence, &request );
+
+    CHAMELEON_Desc_Flush( A, sequence );
+
+    chameleon_sequence_wait( chamctxt, sequence );
+    status = sequence->status;
+    chameleon_sequence_destroy( chamctxt, sequence );
+    return status;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t_Tile_Async
+ *
+ *  CHAMELEON_zgram_Tile_Async - Non-blocking equivalent of CHAMELEON_zgram_Tile().
+ *  May return before the computation is finished.
+ *  Allows for pipelining of operations at runtime.
+ *
+ *******************************************************************************
+ *
+ * @param[in] sequence
+ *          Identifies the sequence of function calls that this call belongs to
+ *          (for completion checks and exception handling purposes).
+ *
+ * @param[out] request
+ *          Identifies this function call (for exception handling purposes).
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zgram
+ * @sa CHAMELEON_zgram_Tile
+ * @sa CHAMELEON_sgram_Tile_Async
+ *
+ */
+int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A,
+                                RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+
+    chamctxt = chameleon_context_self();
+    if (chamctxt == NULL) {
+        chameleon_fatal_error("CHAMELEON_zgram_Tile", "CHAMELEON not initialized");
+        return CHAMELEON_ERR_NOT_INITIALIZED;
+    }
+    if (sequence == NULL) {
+        chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL sequence");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    if (request == NULL) {
+        chameleon_fatal_error("CHAMELEON_zgram_Tile", "NULL request");
+        return CHAMELEON_ERR_UNALLOCATED;
+    }
+    /* Check sequence status */
+    if (sequence->status == CHAMELEON_SUCCESS) {
+        request->status = CHAMELEON_SUCCESS;
+    }
+    else {
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_SEQUENCE_FLUSHED);
+    }
+
+    /* Check descriptors for correctness */
+    if (chameleon_desc_check(A) != CHAMELEON_SUCCESS) {
+        chameleon_error("CHAMELEON_zgram_Tile", "invalid descriptor");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+    /* Check input arguments */
+    if (A->nb != A->mb) {
+        chameleon_error("CHAMELEON_zgram_Tile", "only square tiles supported");
+        return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE);
+    }
+    /* Quick return */
+    if (chameleon_min(A->m, A->n) == 0) {
+        return CHAMELEON_SUCCESS;
+    }
+
+    chameleon_pzgram( uplo, A, sequence, request );
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/control/compute_z.h b/control/compute_z.h
index e34ec4dfc7c4b37c72a6ad267c0ee09fec9da19c..196fde2047bf7ac40676bd383f63c8f9010405e3 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -115,7 +115,10 @@ void chameleon_pzungqr_param( int genD, int K, const libhqr_tree_t *qrtree,
                               CHAM_desc_t *TS, CHAM_desc_t *TT, CHAM_desc_t *D,
                               RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 
-
+/**
+ * Gram function prototypes
+ */
+void chameleon_pzgram( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 
 /**
  *  LAPACK/Tile Descriptor accesses
diff --git a/coreblas/compute/CMakeLists.txt b/coreblas/compute/CMakeLists.txt
index d44a83fd7a2480d0c5197f4b4d4477aebfa4be16..283659eed45ee323aed7c10ff5479c257c442dc9 100644
--- a/coreblas/compute/CMakeLists.txt
+++ b/coreblas/compute/CMakeLists.txt
@@ -44,6 +44,7 @@ set(ZSRC
     core_zgetrf.c
     core_zgetrf_incpiv.c
     core_zgetrf_nopiv.c
+    core_zgram.c
     core_zhe2ge.c
     core_zherfb.c
     core_zhemm.c
diff --git a/coreblas/compute/core_zgessq.c b/coreblas/compute/core_zgessq.c
index 71aa407bdebde121a438604a3d08a9145b802e2f..8b02bb10ae9a69b8e5742f176a26dd2ea9142d1f 100644
--- a/coreblas/compute/core_zgessq.c
+++ b/coreblas/compute/core_zgessq.c
@@ -174,6 +174,24 @@ int CORE_zgessq( cham_store_t storev, int M, int N,
                  const CHAMELEON_Complex64_t *A, int LDA,
                  double *sclssq )
 {
+    int i;
+    int K;
+
+    /* initialize pairs scale, sumsquare if not already done */
+    if ( storev == ChamColumnwise ) {
+        K = N;
+    } else if ( storev == ChamRowwise ) {
+        K = M;
+    } else {
+        K = 1;
+    }
+    for (i=0; i<2*K; i+=2) {
+        if ( ( sclssq[i] == -1. ) && ( sclssq[i+1] == -1. ) ) {
+            sclssq[i] = 1.;
+            sclssq[i+1] = 0.;
+        }
+    }
+
     if (storev == ChamColumnwise) {
         CORE_zgessq_col( M, N, A, LDA, sclssq );
     }
diff --git a/coreblas/compute/core_zgram.c b/coreblas/compute/core_zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..56881ebde58a0514503caacaf74394eb0080c1e0
--- /dev/null
+++ b/coreblas/compute/core_zgram.c
@@ -0,0 +1,141 @@
+/**
+ *
+ * @file core_zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon core_zgram CPU kernel
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-10
+ * @precisions normal z -> s d c z
+ *
+ */
+#include "coreblas.h"
+
+/**
+ *******************************************************************************
+ *
+ * @ingroup CORE_CHAMELEON_Complex64_t
+ *
+ *  CORE_zgram computes the Gram matrix factors of A inplace
+ *
+ *    \f[
+ *    d_{i.}^2 = (1/n) \sum_j d_{ij}^2` and :math:`d_{..}^2 = (1/n^2) \sum_{i,j} d_{ij}^2 \\
+ *    A_{i,j} = -(1/2) (d_{ij}^2 - d_{i.}^2 - d_{.j}^2 + d_{..}^2)
+ *    \f]
+ *
+ *******************************************************************************
+ *
+ * @param[in] uplo
+ *          Specifies the part of the matrix A to be computed.
+ *            = ChamUpperLower: All the matrix A
+ *            = ChamUpper: Upper triangular part
+ *            = ChamLower: Lower triangular part
+ *
+ * @param[in] M
+ *          The number of rows of the overall matrix.  M >= 0.
+ *
+ * @param[in] N
+ *         The number of columns of the overall matrix.  N >= 0.
+ *
+ * @param[in] Mt
+ *          The number of rows of the tile A.  Mt >= 0.
+ *
+ * @param[in] Nt
+ *         The number of columns of the tile A.  Nt >= 0.
+ *
+ * @param[in] Di
+ *         The 1-by-Mt tile containing the sum of squares by rows.
+ *
+ * @param[in] LDDI
+ *         The leading dimension of the array Di.
+ *
+ * @param[in] Dj
+ *         The 1-by-Nt tile containing the sum of squares by columns.
+ *
+ * @param[in] LDDJ
+ *         The leading dimension of the array Dj.
+ *
+ * @param[in] D
+ *         The sum of squares of all the matrix.
+ *
+ * @param[in,out] A
+ *         On entry, the M-by-N tile A.
+ *         On exit, updated by the application of L.
+ *
+ * @param[in] LDA
+ *         The leading dimension of the array A.  LDA >= max(1,M).
+ *
+ *******************************************************************************
+ *
+ * @retval CHAMELEON_SUCCESS successful exit
+ * @retval <0 if INFO = -k, the k-th argument had an illegal value
+ *
+ */
+
+int CORE_zgram( cham_uplo_t uplo,
+                int M, int N,
+                int Mt, int Nt,
+                const double *Di, int LDDI,
+                const double *Dj, int LDDJ,
+                const double *D,
+                double *A, int LDA )
+{
+    int i, j;
+    double coef = -0.5;
+    double di, dj, d;
+
+    /* Check input arguments */
+    if (uplo != ChamUpper && uplo != ChamLower && uplo != ChamUpperLower ) {
+        coreblas_error(1, "Illegal value of uplo");
+        return -1;
+    }
+    if (M < 0) {
+        coreblas_error(2, "Illegal value of M");
+        return -2;
+    }
+    if (N < 0) {
+        coreblas_error(3, "Illegal value of N");
+        return -3;
+    }
+    if (Mt < 0) {
+        coreblas_error(4, "Illegal value of Mt");
+        return -4;
+    }
+    if (Nt < 0) {
+        coreblas_error(5, "Illegal value of Nt");
+        return -5;
+    }
+    if ((LDA < chameleon_max(1,Mt)) && (Mt > 0)) {
+        coreblas_error(12, "Illegal value of LDA");
+        return -12;
+    }
+
+    /* Quick return */
+    if ((Mt == 0) || (Nt == 0))
+        return CHAMELEON_SUCCESS;
+
+    /* overall mean of squares */
+    d = D[0]*D[0]/(M*N);
+
+    for(j = 0; j < Nt; j++) {
+        /* mean of squares of the column */
+        dj = Dj[j*LDDJ]*Dj[j*LDDJ]/M;
+        int mmin = ( uplo == ChamLower ) ? j : 0;
+        int mmax = ( uplo == ChamUpper)  ? chameleon_min(j+1, Mt) : Mt;
+        for(i = mmin; i < mmax; i++) {
+            /* mean of squares of the row */
+            di = Di[i*LDDI]*Di[i*LDDI]/N;
+            /* compute Gram factor */
+            A[j*LDA+i] = coef*(A[j*LDA+i]*A[j*LDA+i] - di - dj + d);
+        }
+    }
+
+    return CHAMELEON_SUCCESS;
+}
diff --git a/coreblas/compute/core_zplssq.c b/coreblas/compute/core_zplssq.c
index 9549453dcd50df72fc2780eadc80d81ceeac9a83..4e8d8750d4dd58457cf72004e82860b3821d82a1 100644
--- a/coreblas/compute/core_zplssq.c
+++ b/coreblas/compute/core_zplssq.c
@@ -41,7 +41,7 @@ CORE_zplssq_col( int M, int N,
 
     for(j=0; j<N; j++) {
         for(i=0; i<M; i++) {
-            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
             tmpScaleIn+=2;
             tmpSumsqIn+=2;
         }
@@ -68,7 +68,7 @@ CORE_zplssq_row( int M, int N,
         tmpScaleOut = sclssqout;
         tmpSumsqOut = sclssqout+1;
         for(i=0; i<M; i++) {
-            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
             tmpScaleIn+=2;
             tmpSumsqIn+=2;
             tmpScaleOut+=2;
@@ -95,7 +95,7 @@ CORE_zplssq_elt( int M, int N,
 
     for(j=0; j<N; j++) {
         for(i=0; i<M; i++) {
-            sumsq_update_2( 1., tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
+            sumsq_update_2( tmpScaleIn, tmpSumsqIn, tmpScaleOut, tmpSumsqOut );
             tmpScaleIn+=2;
             tmpSumsqIn+=2;
         }
@@ -147,6 +147,24 @@ int CORE_zplssq( cham_store_t storev, int M, int N,
                  double *sclssqin,
                  double *sclssqout )
 {
+    int i;
+    int K;
+
+    /* initialize pairs scale, sumsquare if not already done */
+    if ( storev == ChamColumnwise ) {
+        K = N;
+    } else if ( storev == ChamRowwise ) {
+        K = M;
+    } else {
+        K = 1;
+    }
+    for (i=0; i<2*K; i+=2) {
+        if ( ( sclssqout[i] == -1. ) && ( sclssqout[i+1] == -1. ) ) {
+            sclssqout[i] = 1.;
+            sclssqout[i+1] = 0.;
+        }
+    }
+
     if (storev == ChamColumnwise) {
         CORE_zplssq_col( M, N, sclssqin, sclssqout );
     }
@@ -173,7 +191,7 @@ int CORE_zplssq( cham_store_t storev, int M, int N,
  *  @param[in,out] sclssq
  *          The 2*N matrix on which to compute scl*sqrt(ssq)
  *          On entry contains all couple (scl,ssq) in (sclssq[i],sclssq[i+1])
- *          On exist returns sclssq[i]=sclssq[i]*sqrt(sclssq[i+1])
+ *          On exist returns scl*sqrt(ssq) stored in sclssq[2*i], i = 0, ..., N-1
  *          so that the result is stored in the first line.
  *
  *******************************************************************************
@@ -186,8 +204,8 @@ int CORE_zplssq2( int N,
                   double *sclssq )
 {
     int i;
-    for (i=0; i<N; i+=2) {
-        sclssq[i] *= sqrt ( sclssq[i+1] );
+    for (i=0; i<2*N; i+=2) {
+        sclssq[i] = sclssq[i]*sqrt(sclssq[i+1]);
     }
     return CHAMELEON_SUCCESS;
 }
diff --git a/coreblas/compute/core_zsyssq.c b/coreblas/compute/core_zsyssq.c
index dfad10e3e6d90b61e737c6a283452c791c37bea6..0f194cfeb380390ee2aba5d19e8bbd77405aa560 100644
--- a/coreblas/compute/core_zsyssq.c
+++ b/coreblas/compute/core_zsyssq.c
@@ -244,6 +244,20 @@ int CORE_zsyssq( cham_store_t storev, cham_uplo_t uplo, int N,
                  const CHAMELEON_Complex64_t *A, int LDA,
                  double *sclssq )
 {
+    int i;
+    int K = N;
+
+    /* Initialize pairs scale, sumsquare if not already done */
+    if ( storev == ChamEltwise ) {
+        K = 1;
+    }
+    for (i=0; i<2*K; i+=2) {
+        if ( ( sclssq[i] == -1. ) && ( sclssq[i+1] == -1. ) ) {
+            sclssq[i] = 1.;
+            sclssq[i+1] = 0.;
+        }
+    }
+
     if ( uplo == ChamUpper ) {
         if ( storev == ChamEltwise ) {
             CORE_zsyssq_up_elt( N, A, LDA, sclssq );
diff --git a/coreblas/include/coreblas/coreblas_z.h b/coreblas/include/coreblas/coreblas_z.h
index 2bd8c01067594c0c85ee2d43aeb6432d5b9ba031..ee9e3e0d88269259812b5b97f2c5f8a4c9d40972 100644
--- a/coreblas/include/coreblas/coreblas_z.h
+++ b/coreblas/include/coreblas/coreblas_z.h
@@ -360,4 +360,13 @@ int  CORE_zunmqr(cham_side_t side, cham_trans_t trans,
                  CHAMELEON_Complex64_t *C, int LDC,
                  CHAMELEON_Complex64_t *WORK, int LDWORK);
 
+/**
+ * Gram prototypes
+ */
+int CORE_zgram( cham_uplo_t uplo, int M, int N, int Mt, int Nt,
+                const double *Di, int LDDI,
+                const double *Dj, int LDDJ,
+                const double *D,
+                double *A, int LDA );
+
 #endif /* _coreblas_z_h_ */
diff --git a/coreblas/include/coreblas/sumsq_update.h b/coreblas/include/coreblas/sumsq_update.h
index 1c94e85ba13d540c5b72ad3277937829b9375c5b..c5312e9e3a084d94ed589f37b5e8842cf208038b 100644
--- a/coreblas/include/coreblas/sumsq_update.h
+++ b/coreblas/include/coreblas/sumsq_update.h
@@ -79,7 +79,7 @@ sumsq_update( int nb, float *scale, float *sumsq, const float *value )
  *******************************************************************************
  *
  * @brief Update the couple (scale, sumsq) by adding another couple when
- * computing the Froebnius norm.
+ * computing the Froebenius norm.
  *
  * The frobenius norm is equal to scale * sqrt( sumsq ), this method allows to
  * avoid overflow in the sum square computation.
@@ -100,7 +100,7 @@ sumsq_update( int nb, float *scale, float *sumsq, const float *value )
  *******************************************************************************/
 static inline void
 #if defined(PRECISION_d) || defined(PRECISION_z)
-sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout )
+sumsq_update_2( const double *scalein, const double *sumsqin, double *scaleout, double *sumsqout )
 {
     if (*scaleout >= 0.) {
         if ( (*scaleout) < (*scalein) ) {
@@ -114,7 +114,7 @@ sumsq_update_2( int nb, const double *scalein, const double *sumsqin, double *sc
     }
 }
 #elif defined(PRECISION_s) || defined(PRECISION_c)
-sumsq_update_2( int nb, const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout )
+sumsq_update_2( const float *scalein, const float *sumsqin, float *scaleout, float *sumsqout )
 {
     if (*scaleout >= 0.) {
         if ( (*scaleout) < (*scalein) ) {
diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h
index ce3070d1c1c989f7dea1a7446e49198b484a3a71..60b04f38b92e36752588a2d30f2d6eeeb91408ca 100644
--- a/include/chameleon/chameleon_z.h
+++ b/include/chameleon/chameleon_z.h
@@ -331,6 +331,13 @@ int CHAMELEON_zbuild(cham_uplo_t uplo, int M, int N, CHAMELEON_Complex64_t *A, i
 int CHAMELEON_zbuild_Tile(cham_uplo_t uplo,  CHAM_desc_t *A, void *user_data, void* user_build_callback );
 int CHAMELEON_zbuild_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, void *user_data, void* user_build_callback, RUNTIME_sequence_t *sequence, RUNTIME_request_t  *request);
 
+/**
+ * Gram function prototypes
+ */
+int CHAMELEON_zgram( cham_uplo_t uplo, int N, CHAMELEON_Complex64_t *A, int LDA );
+int CHAMELEON_zgram_Tile( cham_uplo_t uplo, CHAM_desc_t *A );
+int CHAMELEON_zgram_Tile_Async( cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
+
 END_C_DECLS
 
 #endif /* _chameleon_z_h_ */
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 2d8ec988208595f60f34b9355fa6f192ba89db6b..10b43a5aa05c1eada9bd384d57882c701fdd153e 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -418,4 +418,15 @@ INSERT_TASK_zttmqr( const RUNTIME_option_t *options,
                                 A1, A1m, A1n, lda1, A2, A2m, A2n, lda2 );
 }
 
+/**
+ * Gram prototype
+ */
+void INSERT_TASK_zgram( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo,
+                        int m, int n, int mt, int nt,
+                        const CHAM_desc_t *Di, int Dim, int Din, int lddi,
+                        const CHAM_desc_t *Dj, int Djm, int Djn, int lddj,
+                        const CHAM_desc_t *D, int Dm, int Dn,
+                        CHAM_desc_t *A, int Am, int An, int lda);
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index e184ca5362b0f2b96519b20d6211fbb0b17f6d19..879d3cee22e8efa6d7cb3d17b4f8a31a3e85adad 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -94,6 +94,10 @@ set(CODELETS_ZSRC
     # BUILD
     ##################
     codelets/codelet_zbuild.c
+    ##################
+    # GRAM
+    ##################
+    codelets/codelet_zgram.c
     )
 
 set(CODELETS_SRC
diff --git a/runtime/openmp/codelets/codelet_zgram.c b/runtime/openmp/codelets/codelet_zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..1250573332538d53bf201c4b82fb7e57ec2aa1c3
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zgram.c
@@ -0,0 +1,43 @@
+/**
+ *
+ * @file openmp/codelet_zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram OpenMP codelet
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-10
+ * @precisions normal z -> s d c z
+ *
+ */
+
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+
+void INSERT_TASK_zgram( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo,
+                        int m, int n, int mt, int nt,
+                        const CHAM_desc_t *Di, int Dim, int Din, int lddi,
+                        const CHAM_desc_t *Dj, int Djm, int Djn, int lddj,
+                        const CHAM_desc_t *D, int Dm, int Dn,
+                        CHAM_desc_t *A, int Am, int An, int lda)
+{
+    double *ptrDi = RTBLKADDR(Di, double, Dim, Din);
+    double *ptrDj = RTBLKADDR(Dj, double, Djm, Djn);
+    double *ptrD  = RTBLKADDR(D,  double, Dm, Dn);
+    double *ptrA  = RTBLKADDR(A,  double, Am, An);
+
+#pragma omp task firstprivate(uplo, m, n, mt, nt, ptrDi, lddi, ptrDj, lddj, ptrD, ptrA, lda) depend(in:ptrDi[0], ptrDj[0], ptrD[0]) depend(inout:ptrA[0])
+    CORE_zgram( uplo,
+                m, n, mt, nt,
+                ptrDi, lddi,
+                ptrDj, lddj,
+                ptrD,
+                ptrA, lda);
+}
diff --git a/runtime/openmp/codelets/codelet_zplssq.c b/runtime/openmp/codelets/codelet_zplssq.c
index 61196102c2d1ba7646925bb4ff27c2e858e08dd2..ad59a3eb6955f386297753e19bc79a9a12f32bc0 100644
--- a/runtime/openmp/codelets/codelet_zplssq.c
+++ b/runtime/openmp/codelets/codelet_zplssq.c
@@ -39,7 +39,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
 void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
-    CHAMELEON_Complex64_t *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn);
+    double *res = RTBLKADDR(RESULT, double, RESULTm, RESULTn);
 
 #pragma omp task firstprivate(N) depend(inout: res[0])
     CORE_zplssq2(N, res);
diff --git a/runtime/parsec/codelets/codelet_zgessq.c b/runtime/parsec/codelets/codelet_zgessq.c
index 93b332cbf9117375286077075141c3f4a7f14560..4cd62d842671401caf7a0422df7b1ac1d3801947 100644
--- a/runtime/parsec/codelets/codelet_zgessq.c
+++ b/runtime/parsec/codelets/codelet_zgessq.c
@@ -55,6 +55,6 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
         sizeof(int),          &n,            VALUE,
         PASSED_BY_REF,   RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
         sizeof(int),          &lda,          VALUE,
-        PASSED_BY_REF,   RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), INOUT | AFFINITY,
+        PASSED_BY_REF,   RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
diff --git a/runtime/parsec/codelets/codelet_zgram.c b/runtime/parsec/codelets/codelet_zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..8d0217cc2552e329b51be45577c71d9399170098
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zgram.c
@@ -0,0 +1,131 @@
+/**
+ *
+ * @file parsec/codelet_zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram PaRSEC codelet
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-11
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_z.h"
+
+/**
+ *
+ * @ingroup INSERT_TASK_Complex64_t
+ *
+ */
+static inline int
+CORE_zgegram_parsec( parsec_execution_stream_t *context,
+                     parsec_task_t             *this_task )
+{
+    cham_uplo_t uplo;
+    int m, n, mt, nt;
+    double *Di;
+    int lddi;
+    double *Dj;
+    int lddj;
+    double *D;
+    double *A;
+    int lda;
+
+    parsec_dtd_unpack_args(
+        this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &Dj, &lddj, &D, &A, &lda );
+
+    CORE_zgram( uplo,
+                m, n, mt, nt,
+                Di, lddi,
+                Dj, lddj,
+                D,
+                A, lda);
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+static inline int
+CORE_zsygram_parsec( parsec_execution_stream_t *context,
+                     parsec_task_t             *this_task )
+{
+    cham_uplo_t uplo;
+    int m, n, mt, nt;
+    double *Di;
+    int lddi;
+    double *D;
+    double *A;
+    int lda;
+
+    parsec_dtd_unpack_args(
+        this_task, &uplo, &m, &n, &mt, &nt, &Di, &lddi, &D, &A, &lda );
+
+    CORE_zgram( uplo,
+                m, n, mt, nt,
+                Di, lddi,
+                Di, lddi,
+                D,
+                A, lda);
+
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE;
+}
+
+void INSERT_TASK_zgram( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo,
+                        int m, int n, int mt, int nt,
+                        const CHAM_desc_t *Di, int Dim, int Din, int lddi,
+                        const CHAM_desc_t *Dj, int Djm, int Djn, int lddj,
+                        const CHAM_desc_t *D, int Dm, int Dn,
+                        CHAM_desc_t *A, int Am, int An, int lda)
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    double *ptrDi, *ptrDj;
+
+    /*
+     * Test if Di is Dj, when we are on the diagonal.
+     * This to avoid having the same data twice in inputs (not handled in parsec).
+     */
+    ptrDi = (double *)(RTBLKADDR( Di, double, Dim, Din ));
+    ptrDj = (double *)(RTBLKADDR( Dj, double, Djm, Djn ));
+    if (ptrDi == ptrDj) {
+        parsec_dtd_taskpool_insert_task(
+            PARSEC_dtd_taskpool, CORE_zsygram_parsec, options->priority, "sygram",
+            sizeof(int),   &uplo, VALUE,
+            sizeof(int),   &m,    VALUE,
+            sizeof(int),   &n,    VALUE,
+            sizeof(int),   &mt,   VALUE,
+            sizeof(int),   &nt,   VALUE,
+            PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT,
+            sizeof(int),   &lddi, VALUE,
+            PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT,
+            PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY,
+            sizeof(int),   &lda,  VALUE,
+            PARSEC_DTD_ARG_END );
+    } else {
+        parsec_dtd_taskpool_insert_task(
+            PARSEC_dtd_taskpool, CORE_zgegram_parsec, options->priority, "gegram",
+            sizeof(int),   &uplo, VALUE,
+            sizeof(int),   &m,    VALUE,
+            sizeof(int),   &n,    VALUE,
+            sizeof(int),   &mt,   VALUE,
+            sizeof(int),   &nt,   VALUE,
+            PASSED_BY_REF, RTBLKADDR( Di, double, Dim, Din ), chameleon_parsec_get_arena_index( Di ) | INPUT,
+            sizeof(int),   &lddi, VALUE,
+            PASSED_BY_REF, RTBLKADDR( Dj, double, Djm, Djn ), chameleon_parsec_get_arena_index( Dj ) | INPUT,
+            sizeof(int),   &lddj, VALUE,
+            PASSED_BY_REF, RTBLKADDR( D, double, Dm, Dn ), chameleon_parsec_get_arena_index( D ) | INPUT,
+            PASSED_BY_REF, RTBLKADDR( A, double, Am, An ), chameleon_parsec_get_arena_index( A ) | INOUT | AFFINITY,
+            sizeof(int),   &lda,  VALUE,
+            PARSEC_DTD_ARG_END );
+    }
+
+}
diff --git a/runtime/parsec/codelets/codelet_zplssq.c b/runtime/parsec/codelets/codelet_zplssq.c
index 16f19a21fad3581ec4a170d7fae3a90b657d5350..96ef96dd5fbf51898396e9d79f7d64f32226b8bb 100644
--- a/runtime/parsec/codelets/codelet_zplssq.c
+++ b/runtime/parsec/codelets/codelet_zplssq.c
@@ -53,8 +53,8 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
         sizeof(int),           &storev,                           VALUE,
         sizeof(int),           &M,                                VALUE,
         sizeof(int),           &N,                                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ),    INPUT,
-        PASSED_BY_REF,         RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ),                INOUT | AFFINITY,
+        PASSED_BY_REF,         RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ) | INPUT,
+        PASSED_BY_REF,         RTBLKADDR( SCLSSQ, double, SCLSSQm, SCLSSQn ), chameleon_parsec_get_arena_index( SCLSSQ) | INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
 
@@ -82,6 +82,6 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
     parsec_dtd_taskpool_insert_task(
         PARSEC_dtd_taskpool, CORE_zplssq2_parsec, options->priority, "plssq2",
         sizeof(int),           &N,                                VALUE,
-        PASSED_BY_REF,         RTBLKADDR( RESULT, double, RESULTm, RESULTn ),     INOUT | AFFINITY,
+        PASSED_BY_REF,         RTBLKADDR( RESULT, double, RESULTm, RESULTn ), chameleon_parsec_get_arena_index( RESULT) | INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
diff --git a/runtime/parsec/codelets/codelet_zsyssq.c b/runtime/parsec/codelets/codelet_zsyssq.c
index 8e86035ccbaafd400985c1ffea92b9e27643e588..d3797ca26c9966bbe8319e1cfd173a430688eaeb 100644
--- a/runtime/parsec/codelets/codelet_zsyssq.c
+++ b/runtime/parsec/codelets/codelet_zsyssq.c
@@ -55,6 +55,6 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
         sizeof(int),            &n,                     VALUE,
         PASSED_BY_REF,          RTBLKADDR( A, CHAMELEON_Complex64_t, Am, An ), chameleon_parsec_get_arena_index( A ) | INPUT,
         sizeof(int),            &lda,                   VALUE,
-        PASSED_BY_REF,          RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ),    INOUT | AFFINITY,
+        PASSED_BY_REF,          RTBLKADDR( SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn ), chameleon_parsec_get_arena_index( SCALESUMSQ ) | INOUT | AFFINITY,
         PARSEC_DTD_ARG_END );
 }
diff --git a/runtime/quark/codelets/codelet_zgessq.c b/runtime/quark/codelets/codelet_zgessq.c
index bf5396e4206dbd5f9389377ac11a16c45b4e24d7..a0f343091a76782f5853b87b73a84da415db1b46 100644
--- a/runtime/quark/codelets/codelet_zgessq.c
+++ b/runtime/quark/codelets/codelet_zgessq.c
@@ -41,13 +41,24 @@ void INSERT_TASK_zgessq( const RUNTIME_option_t *options,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
+    int sizessq;
+
+    if ( storev == ChamColumnwise ) {
+        sizessq = 2*n;
+    } else if ( storev == ChamRowwise ) {
+        sizessq = 2*m;
+    } else {
+        sizessq = 2;
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_GESSQ;
     QUARK_Insert_Task(opt->quark, CORE_zgessq_quark, (Quark_Task_Flags*)opt,
                       sizeof(cham_store_t),            &storev, VALUE,
                       sizeof(int),                     &m,      VALUE,
                       sizeof(int),                     &n,      VALUE,
-                      sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
+                      sizeof(CHAMELEON_Complex64_t)*m*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
                       sizeof(int),                     &lda,    VALUE,
-                      sizeof(double)*2,                RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
+                      sizeof(double)*sizessq,          RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
                       0);
 }
diff --git a/runtime/quark/codelets/codelet_zgram.c b/runtime/quark/codelets/codelet_zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..f2c4228ec1fd7dfbe0535d9fac109e903c12ce0f
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zgram.c
@@ -0,0 +1,68 @@
+/**
+ *
+ * @file quark/codelet_zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram Quark codelet
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-16
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_z.h"
+
+void CORE_zgram_quark(Quark *quark)
+{
+    cham_uplo_t uplo;
+    int m, n, mt, nt;
+    double *Di;
+    int lddi;
+    double *Dj;
+    int lddj;
+    double *D;
+    double *A;
+    int lda;
+
+    quark_unpack_args_12(quark, uplo, m, n, mt, nt, Di, lddi, Dj, lddj, D, A, lda);
+    CORE_zgram( uplo,
+                m, n, mt, nt,
+                Di, lddi,
+                Dj, lddj,
+                D,
+                A, lda);
+}
+
+void INSERT_TASK_zgram( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo,
+                        int m, int n, int mt, int nt,
+                        const CHAM_desc_t *Di, int Dim, int Din, int lddi,
+                        const CHAM_desc_t *Dj, int Djm, int Djn, int lddj,
+                        const CHAM_desc_t *D, int Dm, int Dn,
+                        CHAM_desc_t *A, int Am, int An, int lda)
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_GRAM;
+    QUARK_Insert_Task(opt->quark, CORE_zgram_quark, (Quark_Task_Flags*)opt,
+                      sizeof(int),             &uplo,      VALUE,
+                      sizeof(int),             &m,         VALUE,
+                      sizeof(int),             &n,         VALUE,
+                      sizeof(int),             &mt,        VALUE,
+                      sizeof(int),             &nt,        VALUE,
+                      sizeof(double)*lddi*mt,  RTBLKADDR(Di, double, Dim, Din), INPUT,
+                      sizeof(int),             &lddi,      VALUE,
+                      sizeof(double)*lddj*nt,  RTBLKADDR(Dj, double, Djm, Djn), INPUT,
+                      sizeof(int),             &lddj,      VALUE,
+                      sizeof(double)*2,        RTBLKADDR(D, double, Dm, Dn),  INPUT,
+                      sizeof(double)*mt*nt,    RTBLKADDR(A, double, Am, An),  INOUT,
+                      sizeof(int),             &lda,       VALUE,
+                      0);
+}
diff --git a/runtime/quark/codelets/codelet_zplssq.c b/runtime/quark/codelets/codelet_zplssq.c
index 70810682c9a89cfba122782dbdff65149a17a09e..b340873af51c0e8fae9659def50170eecdb16f83 100644
--- a/runtime/quark/codelets/codelet_zplssq.c
+++ b/runtime/quark/codelets/codelet_zplssq.c
@@ -73,7 +73,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
                          const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn,
                          const CHAM_desc_t *SCLSSQ,     int SCLSSQm,     int SCLSSQn )
 {
-    int sizein = M*N;
+    int sizein = 2*M*N;
     int sizeout;
 
     if ( storev == ChamColumnwise ) {
@@ -85,6 +85,7 @@ void INSERT_TASK_zplssq( const RUNTIME_option_t *options,
     }
 
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_PLSSQ;
     QUARK_Insert_Task(opt->quark, CORE_zplssq_quark, (Quark_Task_Flags*)opt,
         sizeof(int),            &storev,    VALUE,
         sizeof(int),            &M,         VALUE,
@@ -109,6 +110,7 @@ void INSERT_TASK_zplssq2( const RUNTIME_option_t *options, int N,
                           const CHAM_desc_t *RESULT, int RESULTm, int RESULTn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_PLSSQ2;
     QUARK_Insert_Task(opt->quark, CORE_zplssq2_quark, (Quark_Task_Flags*)opt,
         sizeof(int),        &N,         VALUE,
         sizeof(double)*2*N, RTBLKADDR(RESULT, double, RESULTm, RESULTn), INOUT,
diff --git a/runtime/quark/codelets/codelet_zsyssq.c b/runtime/quark/codelets/codelet_zsyssq.c
index cf1f39442c015139d869ae1fc08a5e4a79bd83a9..913424820fd8e01995ef2d889ebd97ac791758cc 100644
--- a/runtime/quark/codelets/codelet_zsyssq.c
+++ b/runtime/quark/codelets/codelet_zsyssq.c
@@ -41,13 +41,22 @@ void INSERT_TASK_zsyssq( const RUNTIME_option_t *options,
                         const CHAM_desc_t *A, int Am, int An, int lda,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
+    int sizessq;
+
+    if ( storev == ChamEltwise ) {
+        sizessq = 2;
+    } else {
+        sizessq = 2*n;
+    }
+
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_SYSSQ;
     QUARK_Insert_Task(opt->quark, CORE_zsyssq_quark, (Quark_Task_Flags*)opt,
         sizeof(cham_store_t),            &storev, VALUE,
         sizeof(int),                     &uplo, VALUE,
         sizeof(int),                     &n,    VALUE,
-        sizeof(CHAMELEON_Complex64_t)*lda*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
+        sizeof(CHAMELEON_Complex64_t)*n*n, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), INPUT,
         sizeof(int),                     &lda,  VALUE,
-        sizeof(double)*2,                RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
+        sizeof(double)*sizessq,          RTBLKADDR(SCALESUMSQ, double, SCALESUMSQm, SCALESUMSQn), INOUT,
         0);
 }
diff --git a/runtime/quark/codelets/codelet_ztrssq.c b/runtime/quark/codelets/codelet_ztrssq.c
index 92cac805f8dd2c791ccafaa1b0c88a6abb7a36fd..80bb9e78c3e41fff60f96fea3d9e9b768a25d65d 100644
--- a/runtime/quark/codelets/codelet_ztrssq.c
+++ b/runtime/quark/codelets/codelet_ztrssq.c
@@ -44,6 +44,7 @@ void INSERT_TASK_ztrssq( const RUNTIME_option_t *options,
                         const CHAM_desc_t *SCALESUMSQ, int SCALESUMSQm, int SCALESUMSQn )
 {
     quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_TRSSQ;
     QUARK_Insert_Task(opt->quark, CORE_ztrssq_quark, (Quark_Task_Flags*)opt,
         sizeof(int),              &uplo, VALUE,
         sizeof(int),              &diag, VALUE,
diff --git a/runtime/quark/include/core_blas_dag.h b/runtime/quark/include/core_blas_dag.h
index 83bdbd531b4161656c30388adc08f4f1ba883e9c..acc7ca8a120651d38deece54c9e3ebae9b9a855f 100644
--- a/runtime/quark/include/core_blas_dag.h
+++ b/runtime/quark/include/core_blas_dag.h
@@ -90,4 +90,12 @@
 #define DAG_CORE_TTMQR      DAG_CORE_TPMQRT
 #define DAG_CORE_TTQRT      DAG_CORE_TPQRT
 
+#define DAG_CORE_GESSQ      DAG_SET_PROPERTIES( "GESSQ"    , "white"    )
+#define DAG_CORE_HESSQ      DAG_SET_PROPERTIES( "HESSQ"    , "white"    )
+#define DAG_CORE_SYSSQ      DAG_SET_PROPERTIES( "SYSSQ"    , "white"    )
+#define DAG_CORE_TRSSQ      DAG_SET_PROPERTIES( "TRSSQ"    , "white"    )
+#define DAG_CORE_PLSSQ      DAG_SET_PROPERTIES( "PLSSQ"    , "white"    )
+#define DAG_CORE_PLSSQ2     DAG_SET_PROPERTIES( "PLSSQ2"   , "white"    )
+#define DAG_CORE_GRAM       DAG_SET_PROPERTIES( "GRAM"     , "orange"   )
+
 #endif /* _core_blas_dag_h_ */
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index dddcb6994da86b889f7d484828d86a9d71cc9814..44d431b7dafc684188f869aa560e0dabdb1f311a 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -22,7 +22,7 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-CHAMELEON_CL_CB(dzasum,         starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      M*N)
+CHAMELEON_CL_CB(dzasum,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zaxpy,         starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[1]), 0,                                      M)
 CHAMELEON_CL_CB(zgeadd,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zlascal,       starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                      M*N)
@@ -34,6 +34,7 @@ CHAMELEON_CL_CB(zgessq,        starpu_matrix_get_nx(task->handles[0]), starpu_ma
 CHAMELEON_CL_CB(zgetrf,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K)
 CHAMELEON_CL_CB(zgetrf_incpiv, starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K)
 CHAMELEON_CL_CB(zgetrf_nopiv,  starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_nx(task->handles[0]), (2./3.)*M*N*K)
+CHAMELEON_CL_CB(zgram,         starpu_matrix_get_nx(task->handles[3]), starpu_matrix_get_ny(task->handles[3]), 0,                                                M*N)
 CHAMELEON_CL_CB(zhe2ge,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                       (1./2.0)*M*N)
 CHAMELEON_CL_CB(zherfb,        starpu_matrix_get_nx(task->handles[0]), starpu_matrix_get_ny(task->handles[0]), 0,                                         2. *M* M*M)
 #if defined(PRECISION_z) || defined(PRECISION_c)
diff --git a/runtime/starpu/codelets/codelet_zgram.c b/runtime/starpu/codelets/codelet_zgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..56af2b407d6bef0b9d2834ff6e8632c497d991c4
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgram.c
@@ -0,0 +1,92 @@
+/**
+ *
+ * @file starpu/codelet_zgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgram StarPU codelet
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-16
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgram_cpu_func(void *descr[], void *cl_arg)
+{
+    cham_uplo_t uplo;
+    int m, n, mt, nt;
+    double *Di;
+    int lddi;
+    double *Dj;
+    int lddj;
+    double *D;
+    double *A;
+    int lda;
+
+    Di = (double *)STARPU_MATRIX_GET_PTR(descr[0]);
+    Dj = (double *)STARPU_MATRIX_GET_PTR(descr[1]);
+    D  = (double *)STARPU_MATRIX_GET_PTR(descr[2]);
+    A  = (double *)STARPU_MATRIX_GET_PTR(descr[3]);
+    starpu_codelet_unpack_args(cl_arg, &uplo, &m, &n, &mt, &nt, &lddi, &lddj, &lda);
+    CORE_zgram( uplo,
+                m, n, mt, nt,
+                Di, lddi,
+                Dj, lddj,
+                D,
+                A, lda);
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU(zgram, 4, cl_zgram_cpu_func)
+
+void INSERT_TASK_zgram( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo,
+                        int m, int n, int mt, int nt,
+                        const CHAM_desc_t *Di, int Dim, int Din, int lddi,
+                        const CHAM_desc_t *Dj, int Djm, int Djn, int lddj,
+                        const CHAM_desc_t *D, int Dm, int Dn,
+                        CHAM_desc_t *A, int Am, int An, int lda)
+{
+  struct starpu_codelet *codelet = &cl_zgram;
+  void (*callback)(void*) = options->profiling ? cl_zgram_callback : NULL;
+
+  CHAMELEON_BEGIN_ACCESS_DECLARATION;
+  CHAMELEON_ACCESS_R(Di, Dim, Din);
+  CHAMELEON_ACCESS_R(Dj, Djm, Djn);
+  CHAMELEON_ACCESS_R(D, Dm, Dn);
+  CHAMELEON_ACCESS_RW(A, Am, An);
+  CHAMELEON_END_ACCESS_DECLARATION;
+
+  starpu_insert_task(
+        starpu_mpi_codelet(codelet),
+        STARPU_VALUE,    &uplo,                      sizeof(int),
+        STARPU_VALUE,    &m,                         sizeof(int),
+        STARPU_VALUE,    &n,                         sizeof(int),
+        STARPU_VALUE,    &mt,                        sizeof(int),
+        STARPU_VALUE,    &nt,                        sizeof(int),
+        STARPU_R,        RTBLKADDR(Di, double, Dim, Din),
+        STARPU_VALUE,    &lddi,                      sizeof(int),
+        STARPU_R,        RTBLKADDR(Dj, double, Djm, Djn),
+        STARPU_VALUE,    &lddj,                      sizeof(int),
+        STARPU_R,        RTBLKADDR(D, double, Dm, Dn),
+        STARPU_RW,       RTBLKADDR(A, double, Am, An),
+        STARPU_VALUE,    &lda,                       sizeof(int),
+        STARPU_PRIORITY, options->priority,
+        STARPU_CALLBACK, callback,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgram",
+#endif
+        0);
+}
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 4936d329c834cccb13f33c65df9e367695040728..c92f3a6583c94379f330ea9d6bad365395df2390 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -112,6 +112,7 @@ CODELETS_HEADER(dzasum)
  */
 CODELETS_HEADER(zplrnt)
 CODELETS_HEADER(zbuild)
+CODELETS_HEADER(zgram)
 
 #if defined(PRECISION_z) || defined(PRECISION_c)
 CODELETS_HEADER(zhessq)
diff --git a/testing/CMakeLists.txt b/testing/CMakeLists.txt
index 40b853364f44b0f3b1e8ba56c4024bfffe291714..4ffbae9eabf6633234d2535c5a6f8e01d717dbc6 100644
--- a/testing/CMakeLists.txt
+++ b/testing/CMakeLists.txt
@@ -94,6 +94,7 @@ set(ZSRC
     #testing_zhegv.c
     #testing_zhegvd.c
     testing_zgeqrf_qdwh.c
+    testing_dgram.c
     )
 
 # Add include and link directories
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index 4fc8f3cf22991b7d6f6a705a4f350d2a6a855f51..a5cbc1710b27393d7ff3d7b8e691b7c8a8a48429 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -66,3 +66,11 @@ endforeach()
 #        add_test(test_mpi_${prec}lange mpirun -np 4 ./${prec}testing 1 0 LANGE 600 500 600 --p=2)
 #    endforeach()
 #endif()
+
+# Specific algorithms
+# Gram
+foreach(cat  ${TEST_CATEGORIES})
+  foreach(prec s;d)
+    add_test(test_${cat}_${prec}gram ./${prec}${TEST_CMD_${cat}} GRAM 117 213 )
+  endforeach()
+endforeach()
\ No newline at end of file
diff --git a/testing/testing_dgram.c b/testing/testing_dgram.c
new file mode 100644
index 0000000000000000000000000000000000000000..c9ca38a3f7a3475e257f41d2a0c8343083efabcb
--- /dev/null
+++ b/testing/testing_dgram.c
@@ -0,0 +1,261 @@
+/**
+ *
+ * @file testing_dgram.c
+ *
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon dgram testing
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @author Florent Pruvost
+ * @date 2019-04-12
+ * @precisions normal d -> d s
+ *
+ */
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+#include <chameleon.h>
+#include <coreblas/cblas.h>
+#include <coreblas/lapacke.h>
+#include <coreblas.h>
+#include "testing_zauxiliary.h"
+#if defined(CHAMELEON_USE_MPI)
+#include <mpi.h>
+#endif
+
+static int check_solution(cham_uplo_t uplo,
+                          int N,
+                          double *Aref,
+                          double *Acham, int LDA);
+static int compute_gram_sequential(cham_uplo_t uplo,
+                                   int N,
+                                   double *A,
+                                   int LDA);
+
+int testing_dgram(int argc, char **argv)
+{
+    int hres = 0;
+    /* Check for number of arguments */
+    if ( argc < 2) {
+        USAGE("GRAM", "N LDA",
+              "   - N      : number of rows of matrix A\n"
+              "   - LDA    : leading dimension of matrix A\n");
+        return -1;
+    }
+    int N     = atoi(argv[0]);
+    int LDA   = atoi(argv[1]);
+
+    double eps;
+    int info_solution;
+    int i, j, ua;
+    int LDAxN = LDA*N;
+
+    double *A     = (double *)malloc(LDAxN*sizeof(double));
+    double *Aref  = (double *)malloc(LDAxN*sizeof(double));
+    double *Acham = (double *)malloc(LDAxN*sizeof(double));
+
+    /* Check if unable to allocate memory */
+    if ( (!A) || (!Aref) || (!Acham) )
+    {
+        free(A); free(Aref); free(Acham);
+        printf("Out of Memory \n ");
+        return -2;
+    }
+
+    eps = LAPACKE_dlamch_work('e');
+
+    if (CHAMELEON_My_Mpi_Rank() == 0){
+        printf("\n");
+        printf("------ TESTS FOR CHAMELEON GRAM ROUTINE -------  \n");
+        printf("            Size of the Matrix %d by %d\n", N, N);
+        printf("\n");
+        printf(" The matrix A is randomly generated for each test.\n");
+        printf("============\n");
+        printf(" The relative machine precision (eps) is to be %e \n",eps);
+        printf(" Computational tests pass if scaled residuals are less than 10.\n");
+    }
+
+    /*----------------------------------------------------------
+     *  TESTING GRAM
+     */
+
+    /* Initialize A such that it is symmetric */
+    CHAMELEON_dplgsy( (double)N, ChamUpperLower, N, A, LDA, 51 );
+    /* Gram is meant to be used with A full of positive values only */
+#if defined(PRECISION_d) || defined(PRECISION_s)
+    for (i=0; i<N; i++) {
+        for (j=0; j<N; j++) {
+            if ( A[i+j*LDA] < 0. ) {
+                A[i+j*LDA] = -A[i+j*LDA];
+            }
+        }
+    }
+#endif
+
+    for (ua=0; ua<3; ua++) {
+        CHAMELEON_dlacpy( ChamUpperLower, N, N, A, LDA, Aref, LDA );
+        CHAMELEON_dlacpy( ChamUpperLower, N, N, A, LDA, Acham, LDA );
+
+        /* CHAMELEON GRAM */
+        CHAMELEON_dgram(uplo[ua], N, Acham, LDA);
+
+        /* Check the solution */
+        info_solution = check_solution(uplo[ua], N, Aref, Acham, LDA);
+
+        if (CHAMELEON_My_Mpi_Rank() == 0){
+            if (info_solution == 0) {
+                printf("***************************************************\n");
+                printf(" ---- TESTING GRAM (%s) ............... PASSED !\n", uplostr[ua]);
+                printf("***************************************************\n");
+            }
+            else {
+                printf("************************************************\n");
+                printf(" - TESTING GRAM (%s) ... FAILED !\n", uplostr[ua]);    hres++;
+                printf("************************************************\n");
+            }
+        }
+    }
+    free(A); free(Aref); free(Acham);
+
+    return hres;
+}
+
+/*--------------------------------------------------------------
+ * Check the solution
+ */
+static int check_solution(cham_uplo_t uplo,
+                          int N,
+                          double *Aref,
+                          double *Acham, int LDA)
+{
+    int info_solution;
+    double Arefnorm, Rnorm, result;
+    double eps;
+    double mdone;
+
+    double *work = (double *)malloc(N * sizeof(double));
+
+    mdone = -1.0;
+
+    /*
+     * Compute the Gram matrix sequentially
+     * we consider the matrix on entry as symmetric
+     */
+    compute_gram_sequential(uplo, N, Aref, LDA);
+
+    /* Compute norm of Aref to scale the result norm */
+    if (uplo == ChamUpperLower) {
+        Arefnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, 'I',
+                                       N,  N,  Aref,  LDA, work);
+    } else {
+        Arefnorm = LAPACKE_dlantr_work(LAPACK_COL_MAJOR, 'I',
+                                       chameleon_lapack_const(uplo), chameleon_lapack_const(ChamNonUnit),
+                                       N, N, Aref, LDA, work);
+    }
+    /* compute the difference Aref = Aref - Acham */
+    cblas_daxpy(LDA*N, mdone, Acham, 1, Aref, 1);
+
+    /* compute the norm of the difference */
+    if (uplo == ChamUpperLower) {
+        Rnorm = LAPACKE_dlange_work(LAPACK_COL_MAJOR, 'I',
+                                    N, N, Aref, LDA, work);
+    } else {
+        Rnorm = LAPACKE_dlantr_work(LAPACK_COL_MAJOR, 'I',
+                                    chameleon_lapack_const(uplo), chameleon_lapack_const(ChamNonUnit),
+                                    N, N, Aref, LDA, work);
+    }
+
+    eps = LAPACKE_dlamch_work('e');
+    if (CHAMELEON_My_Mpi_Rank() == 0)
+        printf("Rnorm %e, Anorm %e\n", Rnorm, Arefnorm);
+
+    /* scale the norm in respect to Aref */
+    result = Rnorm / (Arefnorm * N * eps);
+
+    if (CHAMELEON_My_Mpi_Rank() == 0){
+        printf("============\n");
+        printf("Checking the norm of the difference against reference GRAM \n");
+        printf("-- ||Acham - Aref||_oo/((||Aref||_oo.N.eps) = %e \n",
+               result);
+    }
+
+    if (  isnan(Rnorm) || isinf(Rnorm) || isnan(result) || isinf(result) || (result > 10.0) ) {
+         if (CHAMELEON_My_Mpi_Rank() == 0)
+             printf("-- The solution is suspicious ! \n");
+         info_solution = 1;
+    }
+    else {
+         if (CHAMELEON_My_Mpi_Rank() == 0)
+             printf("-- The solution is CORRECT ! \n");
+         info_solution= 0 ;
+    }
+
+    free(work);
+
+    return info_solution;
+}
+
+/*--------------------------------------------------------------
+ * Compute the Gram matrix sequentially
+ * We consider the matrix on entry as symmetric
+ */
+static int compute_gram_sequential(cham_uplo_t uplo,
+                                   int N,
+                                   double *A,
+                                   int LDA)
+{
+    int m, n;
+    double eps;
+    double squareij, mean_dij, mhalf;
+
+    double *work = (double *)malloc(N * sizeof(double));
+
+    mhalf = -0.5;
+
+    /* initialize work */
+    memset(work, 0., N*sizeof(double));
+
+    /* first: compute the means of squares */
+    for (n=0; n<N; n++) {
+        int mmin = ( uplo == ChamLower ) ? n                     : 0;
+        int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, N) : N;
+        for (m = mmin; m < mmax; m++) {
+            squareij = A[m+n*LDA]*A[m+n*LDA];
+            /* accumulate squares on columns */
+            work[n] += squareij;
+            if ( m != n && uplo != ChamUpperLower ) {
+                /* accumulate squares on the symmetric part */
+                work[m] += squareij;
+            }
+        }
+    }
+    mean_dij = 0.;
+    for (n=0; n<N; n++) {
+        /* accumulate the squares over the entire matrix */
+        mean_dij += work[n];
+        /* compute the mean on each column */
+        work[n] /= N;
+    }
+    /* compute the global mean */
+    mean_dij /= N*N;
+    /* second: compute the Gram matrix factors */
+    for (n=0; n<N; n++) {
+        int mmin = ( uplo == ChamLower ) ? n                     : 0;
+        int mmax = ( uplo == ChamUpper ) ? chameleon_min(n+1, N) : N;
+        for (m = mmin; m < mmax; m++) {
+            squareij = A[m+n*LDA]*A[m+n*LDA];
+            A[m+n*LDA] = mhalf*( squareij - work[m] - work[n] + mean_dij );
+        }
+    }
+
+    free(work);
+
+    return 0;
+}
\ No newline at end of file
diff --git a/testing/testing_zauxiliary.c b/testing/testing_zauxiliary.c
index a880d6e7152042fd1d07be5f361befe7b99d4f71..525edc553c5e9247f8a899f5567540df52b0d07d 100644
--- a/testing/testing_zauxiliary.c
+++ b/testing/testing_zauxiliary.c
@@ -35,7 +35,7 @@ int   ISEED[4] = {0,0,0,1};   /* initial seed for zlarnv() */
 
 cham_storage_t  format[6] = { ChamCM, ChamRM, ChamCCRB, ChamCRRB, ChamRCRB, ChamRRRB };
 cham_side_t     side[2]   = { ChamLeft,    ChamRight };
-cham_uplo_t     uplo[2]   = { ChamUpper,   ChamLower };
+cham_uplo_t     uplo[3]   = { ChamUpper,   ChamLower, ChamUpperLower };
 cham_diag_t     diag[2]   = { ChamNonUnit, ChamUnit  };
 cham_trans_t    trans[3]  = { ChamNoTrans, ChamTrans, ChamConjTrans };
 int             itype[3]  = { 1, 2, 3 };
@@ -44,7 +44,7 @@ cham_normtype_t norm[4]   = { ChamMaxNorm, ChamOneNorm, ChamInfNorm, ChamFrobeni
 
 char *formatstr[6]= { "CM", "RM", "CCRB", "CRRB", "RCRB", "RRRB"};
 char *sidestr[2]  = { "Left ", "Right" };
-char *uplostr[2]  = { "Upper", "Lower" };
+char *uplostr[3]  = { "Upper", "Lower", "UpperLower" };
 char *diagstr[2]  = { "NonUnit", "Unit   " };
 char *transstr[3] = { "N", "T", "H" };
 char *itypestr[3]  = { "inv(U')xAxinv(U) or inv(L)xAxinv(L')", "UxAxU' or L'xAxL", "UxAxU' or L'xAxL" };
@@ -312,6 +312,14 @@ int main (int argc, char **argv)
     else if ( strcmp(func, "GEQRF_QDWH") == 0 ) {
         info += testing_zgeqrf_qdwh( argc, argv );
     }
+    /*
+     * Gram Matrix
+     */
+#if defined(PRECISION_d) || defined(PRECISION_s)
+    else if ( strcmp(func, "GRAM") == 0 ) {
+        info += testing_zgram( argc, argv );
+    }
+#endif
     else {
         fprintf(stderr, "Function unknown\n");
     }
diff --git a/testing/testing_zauxiliary.h b/testing/testing_zauxiliary.h
index 30b4bc004f989dfb5441be8483bc18e88d64264a..efc0fa2cc49ff89e47d234b37cd1bddf188e6537 100644
--- a/testing/testing_zauxiliary.h
+++ b/testing/testing_zauxiliary.h
@@ -49,7 +49,7 @@ extern int ISEED[4];
 
 extern cham_storage_t  format[6];
 extern cham_trans_t    trans[3];
-extern cham_uplo_t     uplo[2];
+extern cham_uplo_t     uplo[3];
 extern cham_side_t     side[2];
 extern cham_diag_t     diag[2];
 extern int             itype[3];
@@ -58,7 +58,7 @@ extern cham_normtype_t norm[4];
 
 extern char *formatstr[6];
 extern char *transstr[3];
-extern char *uplostr[2];
+extern char *uplostr[3];
 extern char *sidestr[2];
 extern char *diagstr[2];
 extern char *itypestr[3];
@@ -115,4 +115,6 @@ int testing_zcgesv(int argc, char **argv);
 int testing_zcungesv(int argc, char **argv);
 #endif
 
+int testing_zgram(int argc, char **argv);
+
 #endif /* _testing_zauxiliary_h_ */