diff --git a/compute/pzgemm.c b/compute/pzgemm.c
index e0d51ac963a5078363f6b7386dc06c5e2e221bec..d6d9c7eec81e74a2da0bcd0eea344ccf2dacc98b 100644
--- a/compute/pzgemm.c
+++ b/compute/pzgemm.c
@@ -26,16 +26,176 @@
 #define A(m, n) A,  m,  n
 #define B(m, n) B,  m,  n
 #define C(m, n) C,  m,  n
+#define WA(m, n) &WA,  m,  n
+#define WB(m, n) &WB,  m,  n
+
 /**
- *  Parallel tile matrix-matrix multiplication - dynamic scheduling
+ *  Parallel tile matrix-matrix multiplication
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
  */
-void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
-                         CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                         CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
+static inline void
+chameleon_pzgemm_summa( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
 {
-    CHAM_context_t *chamctxt;
-    RUNTIME_option_t options;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    int m, n, k, p, q, KT, K, lp, lq;
+    int ldam, ldak, ldbn, ldbk, ldcm;
+    int tempmm, tempnn, tempkk;
+    int lookahead, myp, myq;
+
+    CHAMELEON_Complex64_t zbeta;
+    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    CHAM_desc_t WA, WB;
+
+    lookahead = chamctxt->lookahead;
+    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                         NULL, NULL, NULL );
+    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                         NULL, NULL, NULL );
+
+    KT  = transA == ChamNoTrans ? A->nt : A->mt;
+    K   = transA == ChamNoTrans ? A->n  : A->m;
+    myp = A->myrank / A->q;
+    myq = A->myrank % A->q;
+
+    /*
+     *  A: ChamNoTrans / B: ChamNoTrans
+     */
+    for (k = 0; k < KT; k++ ) {
+        lp = (k % lookahead) * C->p;
+        lq = (k % lookahead) * C->q;
+        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
+        zbeta = k == 0 ? beta : zone;
+        ldak = BLKLDD(A, k);
+        ldbk = BLKLDD(B, k);
+
+        /* Transfert ownership of the k column of A */
+        for (m = 0; m < C->mt; m ++ ) {
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+            ldam = BLKLDD(A, m);
+
+            if ( transA == ChamNoTrans ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    A(  m,  k ),             ldam,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, A( m, k ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+            else {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempmm, C->mb,
+                    A(  k,  m ),              ldak,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, A( k, m ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempmm, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+        }
+
+        /* Transfert ownership of the k row of B */
+        for (n = 0; n < C->nt; n++) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            ldbn = BLKLDD(B, n);
+
+            if ( transB == ChamNoTrans ) {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    B(   k,              n ), ldbk,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, B( k, n ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+            else {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempnn, tempkk, C->mb,
+                    B(   n,              k ), ldbn,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, B( n, k ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempnn, tempkk, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+        }
+
+        for (m = myp; m < C->mt; m+=C->p) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+            ldcm = BLKLDD(C, m);
+
+            for (n = myq; n < C->nt; n+=C->q) {
+                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+                INSERT_TASK_zgemm(
+                    options,
+                    transA, transB,
+                    tempmm, tempnn, tempkk, A->mb,
+                    alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
+                           WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
+                    zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+            }
+        }
+    }
+
+    RUNTIME_desc_flush( &WA, sequence );
+    RUNTIME_desc_flush( &WB, sequence );
+    RUNTIME_desc_flush(  C,  sequence );
+    chameleon_sequence_wait( chamctxt, sequence );
+    chameleon_desc_destroy( &WA );
+    chameleon_desc_destroy( &WB );
+}
+
+/**
+ *  Parallel tile matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution.
+ */
+static inline void
+chameleon_pzgemm_generic( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                          RUNTIME_option_t *options )
+{
+    RUNTIME_sequence_t *sequence = options->sequence;
 
     int m, n, k;
     int ldam, ldak, ldbn, ldbk, ldcm;
@@ -44,12 +204,6 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
     CHAMELEON_Complex64_t zbeta;
     CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
 
-    chamctxt = chameleon_context_self();
-    if (sequence->status != CHAMELEON_SUCCESS) {
-        return;
-    }
-    RUNTIME_options_init(&options, chamctxt, sequence, request);
-
     for (m = 0; m < C->mt; m++) {
         tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
         ldcm = BLKLDD(C, m);
@@ -66,11 +220,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
                         ldbk = BLKLDD(B, k);
                         zbeta = k == 0 ? beta : zone;
                         INSERT_TASK_zgemm(
-                            &options,
+                            options,
                             transA, transB,
                             tempmm, tempnn, tempkn, A->mb,
                             alpha, A(m, k), ldam,  /* lda * Z */
-                                   B(k, n), ldbk,  /* ldb * Y */
+                            B(k, n), ldbk,  /* ldb * Y */
                             zbeta, C(m, n), ldcm); /* ldc * Y */
                     }
                 }
@@ -83,11 +237,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
                         tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
                         zbeta = k == 0 ? beta : zone;
                         INSERT_TASK_zgemm(
-                            &options,
+                            options,
                             transA, transB,
                             tempmm, tempnn, tempkn, A->mb,
                             alpha, A(m, k), ldam,  /* lda * Z */
-                                   B(n, k), ldbn,  /* ldb * Z */
+                            B(n, k), ldbn,  /* ldb * Z */
                             zbeta, C(m, n), ldcm); /* ldc * Y */
                     }
                 }
@@ -103,11 +257,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
                         ldbk = BLKLDD(B, k);
                         zbeta = k == 0 ? beta : zone;
                         INSERT_TASK_zgemm(
-                            &options,
+                            options,
                             transA, transB,
                             tempmm, tempnn, tempkm, A->mb,
                             alpha, A(k, m), ldak,  /* lda * X */
-                                   B(k, n), ldbk,  /* ldb * Y */
+                            B(k, n), ldbk,  /* ldb * Y */
                             zbeta, C(m, n), ldcm); /* ldc * Y */
                     }
                 }
@@ -121,11 +275,11 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
                         ldak = BLKLDD(A, k);
                         zbeta = k == 0 ? beta : zone;
                         INSERT_TASK_zgemm(
-                            &options,
+                            options,
                             transA, transB,
                             tempmm, tempnn, tempkm, A->mb,
                             alpha, A(k, m), ldak,  /* lda * X */
-                                   B(n, k), ldbn,  /* ldb * Z */
+                            B(n, k), ldbn,  /* ldb * Z */
                             zbeta, C(m, n), ldcm); /* ldc * Y */
                     }
                 }
@@ -142,5 +296,37 @@ void chameleon_pzgemm(cham_trans_t transA, cham_trans_t transB,
             }
         }
     }
-    RUNTIME_options_finalize(&options, chamctxt);
+
+    (void)chamctxt;
+}
+
+/**
+ *  Parallel tile matrix-matrix multiplication. wrapper.
+ */
+void
+chameleon_pzgemm( cham_trans_t transA, cham_trans_t transB,
+                  CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                  CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS) {
+        return;
+    }
+    RUNTIME_options_init( &options, chamctxt, sequence, request );
+
+    if ( ((C->p > 1) || (C->q > 1)) &&
+         (C->get_rankof == chameleon_getrankof_2d) &&
+         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
+    {
+        chameleon_pzgemm_summa(   chamctxt, transA, transB, alpha, A, B, beta, C, &options );
+    }
+    else {
+        chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
+    }
+
+    RUNTIME_options_finalize( &options, chamctxt );
 }
diff --git a/compute/pzhemm.c b/compute/pzhemm.c
index 9fcaf6c3797659380fb06ec012db95b618bb4248..93efe6fcaf43be3a57c8c8314a7b9c11f6c7448d 100644
--- a/compute/pzhemm.c
+++ b/compute/pzhemm.c
@@ -23,20 +23,230 @@
  */
 #include "control/common.h"
 
-#define A(m,n) A,  m,  n
-#define B(m,n) B,  m,  n
-#define C(m,n) C,  m,  n
+#define A(m, n) A,  m,  n
+#define B(m, n) B,  m,  n
+#define C(m, n) C,  m,  n
+#define WA(m, n) &WA,  m,  n
+#define WB(m, n) &WB,  m,  n
+
 /**
- *  Parallel tile Hermitian matrix-matrix multiplication - dynamic scheduling
+ *  Parallel tile hermitian matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
  */
-void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
-                         CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                         CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
-                         RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
+static inline void
+chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
 {
-    CHAM_context_t *chamctxt;
-    RUNTIME_option_t options;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    cham_trans_t transA, transB;
+    int Am, An, m, n, k, p, q, KT, K, lp, lq;
+    int ldam, ldbk, ldbm, ldcm;
+    int tempmm, tempnn, tempkk;
+    int lookahead, myp, myq;
+
+    CHAMELEON_Complex64_t zbeta;
+    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    CHAM_desc_t WA, WB;
+
+    lookahead = chamctxt->lookahead;
+    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                         NULL, NULL, NULL );
+    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                         NULL, NULL, NULL );
+
+    KT  = side == ChamLeft ? A->nt : A->mt;
+    K   = side == ChamLeft ? A->n  : A->m;
+    myp = C->myrank / C->q;
+    myq = C->myrank % C->q;
+
+    for (k = 0; k < KT; k++ ) {
+        lp = (k % lookahead) * C->p;
+        lq = (k % lookahead) * C->q;
+        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
+        zbeta = k == 0 ? beta : zone;
+        ldbk = BLKLDD(B, k);
+
+        /* Transfert ownership of the k column of A or B */
+        for (m = 0; m < C->mt; m ++ ) {
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+
+            if ( side == ChamLeft ) {
+                if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                     (( uplo == ChamLower ) && ( m < k )) ) {
+                    Am = k;
+                    An = m;
+                }
+                else {
+                    Am = m;
+                    An = k;
+                }
+                ldam = BLKLDD(A, Am);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    A(  Am, An ),             ldam,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, A( Am, An ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+            else {
+                ldbm = BLKLDD(B, m);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    B(  m,  k ),              ldbm,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, B( m, k ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+        }
+
+        /* Transfert ownership of the k row of B, or A */
+        for (n = 0; n < C->nt; n++) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+            if ( side == ChamRight ) {
+                if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                     (( uplo == ChamLower ) && ( n > k )) ) {
+                    Am = n;
+                    An = k;
+                }
+                else {
+                    Am = k;
+                    An = n;
+                }
+                ldam = BLKLDD(A, Am);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    A(   Am,            An ), ldam,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, A( Am, An ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+            else {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    B(   k,              n ), ldbk,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, B( k, n ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+        }
 
+        /*
+         *  ChamLeft / ChamLower
+         */
+        for (m = myp; m < C->mt; m+=C->p) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+            ldcm = BLKLDD(C, m);
+
+            for (n = myq; n < C->nt; n+=C->q) {
+                tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+                if (side == ChamLeft) {
+                    transB = ChamNoTrans;
+                    if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                         (( uplo == ChamLower ) && ( m < k )) ) {
+                        transA = ChamConjTrans;
+                    }
+                    else {
+                        transA = ChamNoTrans;
+                    }
+                }
+                else {
+                    transA = ChamNoTrans;
+                    if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                         (( uplo == ChamLower ) && ( n > k )) ) {
+                        transB = ChamConjTrans;
+                    }
+                    else {
+                        transB = ChamNoTrans;
+                    }
+                }
+
+                if ( k == m ) {
+                    INSERT_TASK_zhemm(
+                        options, side, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
+                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
+                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                }
+                else {
+                    INSERT_TASK_zgemm(
+                        options, transA, transB,
+                        tempmm, tempnn, tempkk, A->mb,
+                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
+                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
+                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                }
+            }
+        }
+    }
+
+    RUNTIME_desc_flush( &WA, sequence );
+    RUNTIME_desc_flush( &WB, sequence );
+    RUNTIME_desc_flush(  C,  sequence );
+    chameleon_sequence_wait( chamctxt, sequence );
+    chameleon_desc_destroy( &WA );
+    chameleon_desc_destroy( &WB );
+}
+
+/**
+ *  Parallel tile hermitian matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution.
+ */
+static inline void
+chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                          RUNTIME_option_t *options )
+{
     int k, m, n;
     int ldam, ldan, ldak, ldbk, ldbm, ldcm;
     int tempmm, tempnn, tempkn, tempkm;
@@ -44,12 +254,6 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
     CHAMELEON_Complex64_t zbeta;
     CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
 
-    chamctxt = chameleon_context_self();
-    if (sequence->status != CHAMELEON_SUCCESS) {
-        return;
-    }
-    RUNTIME_options_init(&options, chamctxt, sequence, request);
-
     for(m = 0; m < C->mt; m++) {
         tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
         ldcm = BLKLDD(C, m);
@@ -68,7 +272,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         zbeta = k == 0 ? beta : zone;
                         if (k < m) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamNoTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkm, A->mb,
                                 alpha, A(m, k), ldam,  /* lda * K */
@@ -78,7 +282,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         else {
                             if (k == m) {
                                 INSERT_TASK_zhemm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
                                     alpha, A(k, k), ldak,  /* ldak * X */
@@ -87,7 +291,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamConjTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkm, A->mb,
                                     alpha, A(k, m), ldak,  /* ldak * X */
@@ -108,7 +312,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         zbeta = k == 0 ? beta : zone;
                         if (k < m) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamConjTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkm, A->mb,
                                 alpha, A(k, m), ldak,  /* ldak * X */
@@ -118,7 +322,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         else {
                             if (k == m) {
                                 INSERT_TASK_zhemm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
                                     alpha, A(k, k), ldak,  /* ldak * K */
@@ -127,7 +331,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkm, A->mb,
                                     alpha, A(m, k), ldam,  /* lda * K */
@@ -151,7 +355,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         zbeta = k == 0 ? beta : zone;
                         if (k < n) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamNoTrans, ChamConjTrans,
                                 tempmm, tempnn, tempkn, A->mb,
                                 alpha, B(m, k), ldbm,  /* ldb * K */
@@ -161,7 +365,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         else {
                             if (k == n) {
                                 INSERT_TASK_zhemm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
                                     alpha, A(k, k), ldak,  /* ldak * Y */
@@ -170,7 +374,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkn, A->mb,
                                     alpha, B(m, k), ldbm,  /* ldb  * K */
@@ -190,7 +394,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         zbeta = k == 0 ? beta : zone;
                         if (k < n) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamNoTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkn, A->mb,
                                 alpha, B(m, k), ldbm,  /* ldb  * K */
@@ -200,7 +404,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                         else {
                             if (k == n) {
                                 INSERT_TASK_zhemm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
                                     alpha, A(k, k), ldak,  /* ldak * Y */
@@ -209,7 +413,7 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamConjTrans,
                                     tempmm, tempnn, tempkn, A->mb,
                                     alpha, B(m, k), ldbm,  /* ldb * K */
@@ -222,5 +426,36 @@ void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo,
             }
         }
     }
-    RUNTIME_options_finalize(&options, chamctxt);
+    (void)chamctxt;
+}
+
+/**
+ *  Parallel tile hermitian matrix-matrix multiplication. wrapper.
+ */
+void
+chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo,
+                  CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                  CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS) {
+        return;
+    }
+    RUNTIME_options_init( &options, chamctxt, sequence, request );
+
+    if ( ((C->p > 1) || (C->q > 1)) &&
+         (C->get_rankof == chameleon_getrankof_2d) &&
+         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
+    {
+        chameleon_pzhemm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+    }
+    else {
+        chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+    }
+
+    RUNTIME_options_finalize( &options, chamctxt );
 }
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index f7df025e27e6814f9fe11d2fd96b158d1714ff02..0413d63b5f94fe5266ec2140232998360b175de8 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -23,234 +23,439 @@
  */
 #include "control/common.h"
 
-#define A(m,n) A,  m,  n
-#define B(m,n) B,  m,  n
-#define C(m,n) C,  m,  n
+#define A(m, n) A,  m,  n
+#define B(m, n) B,  m,  n
+#define C(m, n) C,  m,  n
+#define WA(m, n) &WA,  m,  n
+#define WB(m, n) &WB,  m,  n
+
 /**
- *  Parallel tile symmetric matrix-matrix multiplication - dynamic scheduling
+ *  Parallel tile symmetric matrix-matrix multiplication.
+ *  SUMMA algorithm for 2D block-cyclic data distribution.
  */
-void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo,
-                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                          CHAMELEON_Complex64_t beta, CHAM_desc_t *C,
-                          RUNTIME_sequence_t *sequence, RUNTIME_request_t *request)
+static inline void
+chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
 {
-    CHAM_context_t *chamctxt;
-    RUNTIME_option_t options;
-
-    int k, m, n;
-    int ldak, ldam, ldan, ldbk, ldbm, ldcm;
-    int tempmm, tempnn, tempkn, tempkm;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    cham_trans_t transA, transB;
+    int Am, An, m, n, k, p, q, KT, K, lp, lq;
+    int ldam, ldbk, ldbm, ldcm;
+    int tempmm, tempnn, tempkk;
+    int lookahead, myp, myq;
 
     CHAMELEON_Complex64_t zbeta;
     CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    CHAM_desc_t WA, WB;
 
-    chamctxt = chameleon_context_self();
-    if (sequence->status != CHAMELEON_SUCCESS) {
-        return;
-    }
-    RUNTIME_options_init(&options, chamctxt, sequence, request);
-
-    /*
-     *  ChamLeft
-     */
-    if (side == ChamLeft) {
-        for (k = 0; k < C->mt; k++) {
-            tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
-            ldak = BLKLDD(A, k);
-            ldbk = BLKLDD(B, k);
-            zbeta = k == 0 ? beta : zone;
-
-            for (n = 0; n < C->nt; n++) {
+    lookahead = chamctxt->lookahead;
+    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                         NULL, NULL, NULL );
+    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
+                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                         NULL, NULL, NULL );
+
+    KT  = side == ChamLeft ? A->nt : A->mt;
+    K   = side == ChamLeft ? A->n  : A->m;
+    myp = C->myrank / C->q;
+    myq = C->myrank % C->q;
+
+    for (k = 0; k < KT; k++ ) {
+        lp = (k % lookahead) * C->p;
+        lq = (k % lookahead) * C->q;
+        tempkk = k == KT - 1 ? K - k * A->nb : A->nb;
+        zbeta = k == 0 ? beta : zone;
+        ldbk = BLKLDD(B, k);
+
+        /* Transfert ownership of the k column of A or B */
+        for (m = 0; m < C->mt; m ++ ) {
+            tempmm = m == C->mt-1 ? C->m - m * C->mb : C->mb;
+
+            if ( side == ChamLeft ) {
+                if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                     (( uplo == ChamLower ) && ( m < k )) ) {
+                    Am = k;
+                    An = m;
+                }
+                else {
+                    Am = m;
+                    An = k;
+                }
+                ldam = BLKLDD(A, Am);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    A(  Am, An ),             ldam,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, A( Am, An ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+            else {
+                ldbm = BLKLDD(B, m);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempmm, tempkk, C->mb,
+                    B(  m,  k ),              ldbm,
+                    WA( m, (k % C->q) + lq ), WA.mb );
+
+                RUNTIME_data_flush( sequence, B( m, k ) );
+
+                for ( q=1; q < C->q; q++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempmm, tempkk, C->mb,
+                        WA( m, ((k+q-1) % C->q) + lq ), WA.mb,
+                        WA( m, ((k+q)   % C->q) + lq ), WA.mb );
+                }
+            }
+        }
+
+        /* Transfert ownership of the k row of B, or A */
+        for (n = 0; n < C->nt; n++) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+
+            if ( side == ChamRight ) {
+                if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                     (( uplo == ChamLower ) && ( n > k )) ) {
+                    Am = n;
+                    An = k;
+                }
+                else {
+                    Am = k;
+                    An = n;
+                }
+                ldam = BLKLDD(A, Am);
+
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    A(   Am,            An ), ldam,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, A( Am, An ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+            else {
+                INSERT_TASK_zlacpy(
+                    options,
+                    ChamUpperLower, tempkk, tempnn, C->mb,
+                    B(   k,              n ), ldbk,
+                    WB( (k % C->p) + lp, n ), WB.mb );
+
+                RUNTIME_data_flush( sequence, B( k, n ) );
+
+                for ( p=1; p < C->p; p++ ) {
+                    INSERT_TASK_zlacpy(
+                        options,
+                        ChamUpperLower, tempkk, tempnn, C->mb,
+                        WB( ((k+p-1) % C->p) + lp, n ), WB.mb,
+                        WB( ((k+p)   % C->p) + lp, n ), WB.mb );
+                }
+            }
+        }
+
+        /*
+         *  ChamLeft / ChamLower
+         */
+        for (m = myp; m < C->mt; m+=C->p) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+            ldcm = BLKLDD(C, m);
+
+            for (n = myq; n < C->nt; n+=C->q) {
                 tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
 
-                for (m = 0; m < C->mt; m++) {
-                    tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
-                    ldam = BLKLDD(A, m);
-                    ldcm = BLKLDD(C, m);
+                if (side == ChamLeft) {
+                    transB = ChamNoTrans;
+                    if ( (( uplo == ChamUpper ) && ( m > k )) ||
+                         (( uplo == ChamLower ) && ( m < k )) ) {
+                        transA = ChamTrans;
+                    }
+                    else {
+                        transA = ChamNoTrans;
+                    }
+                }
+                else {
+                    transA = ChamNoTrans;
+                    if ( (( uplo == ChamUpper ) && ( n < k )) ||
+                         (( uplo == ChamLower ) && ( n > k )) ) {
+                        transB = ChamTrans;
+                    }
+                    else {
+                        transB = ChamNoTrans;
+                    }
+                }
 
-                    /*
-                     *  ChamLeft / ChamLower
-                     */
-                    if (uplo == ChamLower) {
+                if ( k == m ) {
+                    INSERT_TASK_zsymm(
+                        options, side, uplo,
+                        tempmm, tempnn, A->mb,
+                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
+                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
+                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                }
+                else {
+                    INSERT_TASK_zgemm(
+                        options, transA, transB,
+                        tempmm, tempnn, tempkk, A->mb,
+                        alpha, WA( m,        myq + lq ), WA.mb,  /* lda * Z */
+                               WB( myp + lp, n        ), WB.mb,  /* ldb * Y */
+                        zbeta, C(  m,        n        ), ldcm ); /* ldc * Y */
+                }
+            }
+        }
+    }
+
+    RUNTIME_desc_flush( &WA, sequence );
+    RUNTIME_desc_flush( &WB, sequence );
+    RUNTIME_desc_flush(  C,  sequence );
+    chameleon_sequence_wait( chamctxt, sequence );
+    chameleon_desc_destroy( &WA );
+    chameleon_desc_destroy( &WB );
+}
+
+/**
+ *  Parallel tile symmetric matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution.
+ */
+static inline void
+chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                          RUNTIME_option_t *options )
+{
+    int k, m, n;
+    int ldam, ldan, ldak, ldbk, ldbm, ldcm;
+    int tempmm, tempnn, tempkn, tempkm;
+
+    CHAMELEON_Complex64_t zbeta;
+    CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+
+    for(m = 0; m < C->mt; m++) {
+        tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+        ldcm = BLKLDD(C, m);
+        for(n = 0; n < C->nt; n++) {
+            tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+            /*
+             *  ChamLeft / ChamLower
+             */
+            if (side == ChamLeft) {
+                ldam = BLKLDD(A, m);
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+                        ldak = BLKLDD(A, k);
+                        ldbk = BLKLDD(B, k);
+                        zbeta = k == 0 ? beta : zone;
                         if (k < m) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamNoTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkm, A->mb,
-                                alpha, A(m, k), ldam,
-                                       B(k, n), ldbk,
-                                zbeta, C(m, n), ldcm);
+                                alpha, A(m, k), ldam,  /* lda * K */
+                                       B(k, n), ldbk,  /* ldb * Y */
+                                zbeta, C(m, n), ldcm); /* ldc * Y */
                         }
                         else {
                             if (k == m) {
                                 INSERT_TASK_zsymm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
-                                    alpha, A(k, k), ldak,
-                                           B(k, n), ldbk,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, A(k, k), ldak,  /* ldak * X */
+                                           B(k, n), ldbk,  /* ldb  * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkm, A->mb,
-                                    alpha, A(k, m), ldak,
-                                           B(k, n), ldbk,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, A(k, m), ldak,  /* ldak * X */
+                                           B(k, n), ldbk,  /* ldb  * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                         }
                     }
-                    /*
-                     *  ChamLeft / ChamUpper
-                     */
-                    else {
+                }
+                /*
+                 *  ChamLeft / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+                        ldak = BLKLDD(A, k);
+                        ldbk = BLKLDD(B, k);
+                        zbeta = k == 0 ? beta : zone;
                         if (k < m) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkm, A->mb,
-                                alpha, A(k, m), ldak,
-                                       B(k, n), ldbk,
-                                zbeta, C(m, n), ldcm);
+                                alpha, A(k, m), ldak,  /* ldak * X */
+                                       B(k, n), ldbk,  /* ldb  * Y */
+                                zbeta, C(m, n), ldcm); /* ldc  * Y */
                         }
                         else {
                             if (k == m) {
                                 INSERT_TASK_zsymm(
-                                    &options,
+                                    options,
                                     side, uplo,
                                     tempmm, tempnn, A->mb,
-                                    alpha, A(k, k), ldak,
-                                           B(k, n), ldbk,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, A(k, k), ldak,  /* ldak * K */
+                                           B(k, n), ldbk,  /* ldb  * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkm, A->mb,
-                                    alpha, A(m, k), ldam,
-                                           B(k, n), ldbk,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, A(m, k), ldam,  /* lda * K */
+                                           B(k, n), ldbk,  /* ldb * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc * Y */
                             }
                         }
                     }
                 }
-                RUNTIME_data_flush( sequence, B(k, n) );
-            }
-            if (uplo == ChamLower) {
-                for (n = 0; n <= k; n++) {
-                    RUNTIME_data_flush( sequence, A(k, n) );
-                }
             }
+            /*
+             *  ChamRight / ChamLower
+             */
             else {
-                for (m = 0; m <= k; m++) {
-                    RUNTIME_data_flush( sequence, A(m, k) );
-                }
-            }
-        }
-    }
-    /*
-     *  ChamRight
-     */
-    else {
-        for (k = 0; k < C->nt; k++) {
-            tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
-            ldak = BLKLDD(A, k);
-            zbeta = k == 0 ? beta : zone;
-
-            for (m = 0; m < C->mt; m++) {
-                tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+                ldan = BLKLDD(A, n);
                 ldbm = BLKLDD(B, m);
-                ldcm = BLKLDD(C, m);
-
-                for (n = 0; n < C->nt; n++) {
-                    tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
-                    ldan = BLKLDD(A, n);
-
-                    /*
-                     *  ChamRight / ChamLower
-                     */
-                    if (uplo == ChamLower) {
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+                        ldak = BLKLDD(A, k);
+                        zbeta = k == 0 ? beta : zone;
                         if (k < n) {
-                           INSERT_TASK_zgemm(
-                               &options,
-                               ChamNoTrans, ChamTrans,
-                               tempmm, tempnn, tempkn, A->mb,
-                               alpha, B(m, k), ldbm,
-                                      A(n, k), ldan,
-                               zbeta, C(m, n), ldcm);
+                            INSERT_TASK_zgemm(
+                                options,
+                                ChamNoTrans, ChamTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k), ldbm,  /* ldb * K */
+                                       A(n, k), ldan,  /* lda * K */
+                                zbeta, C(m, n), ldcm); /* ldc * Y */
                         }
                         else {
                             if (k == n) {
-                               INSERT_TASK_zsymm(
-                                   &options,
-                                   ChamRight, uplo,
-                                   tempmm, tempnn, A->mb,
-                                   alpha, A(k, k), ldak,
-                                          B(m, k), ldbm,
-                                   zbeta, C(m, n), ldcm);
+                                INSERT_TASK_zsymm(
+                                    options,
+                                    side, uplo,
+                                    tempmm, tempnn, A->mb,
+                                    alpha, A(k, k), ldak,  /* ldak * Y */
+                                           B(m, k), ldbm,  /* ldb  * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamNoTrans,
                                     tempmm, tempnn, tempkn, A->mb,
-                                    alpha, B(m, k), ldbm,
-                                           A(k, n), ldak,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, B(m, k), ldbm,  /* ldb  * K */
+                                           A(k, n), ldak,  /* ldak * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                         }
                     }
-                    /*
-                     *  ChamRight / ChamUpper
-                     */
-                    else {
+                }
+                /*
+                 *  ChamRight / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+                        ldak = BLKLDD(A, k);
+                        zbeta = k == 0 ? beta : zone;
                         if (k < n) {
                             INSERT_TASK_zgemm(
-                                &options,
+                                options,
                                 ChamNoTrans, ChamNoTrans,
                                 tempmm, tempnn, tempkn, A->mb,
-                                alpha, B(m, k), ldbm,
-                                       A(k, n), ldak,
-                                zbeta, C(m, n), ldcm);
+                                alpha, B(m, k), ldbm,  /* ldb  * K */
+                                       A(k, n), ldak,  /* ldak * Y */
+                                zbeta, C(m, n), ldcm); /* ldc  * Y */
                         }
                         else {
                             if (k == n) {
                                 INSERT_TASK_zsymm(
-                                    &options,
-                                    ChamRight, uplo,
+                                    options,
+                                    side, uplo,
                                     tempmm, tempnn, A->mb,
-                                    alpha, A(k, k), ldak,
-                                           B(m, k), ldbm,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, A(k, k), ldak,  /* ldak * Y */
+                                           B(m, k), ldbm,  /* ldb  * Y */
+                                    zbeta, C(m, n), ldcm); /* ldc  * Y */
                             }
                             else {
                                 INSERT_TASK_zgemm(
-                                    &options,
+                                    options,
                                     ChamNoTrans, ChamTrans,
                                     tempmm, tempnn, tempkn, A->mb,
-                                    alpha, B(m, k), ldbm,
-                                           A(n, k), ldan,
-                                    zbeta, C(m, n), ldcm);
+                                    alpha, B(m, k), ldbm,  /* ldb * K */
+                                           A(n, k), ldan,  /* lda * K */
+                                    zbeta, C(m, n), ldcm); /* ldc * Y */
                             }
                         }
                     }
                 }
-                RUNTIME_data_flush( sequence, B(m, k) );
-            }
-            if (uplo == ChamLower) {
-                for (n = 0; n <= k; n++) {
-                    RUNTIME_data_flush( sequence, A(k, n) );
-                }
-            }
-            else {
-                for (m = 0; m <= k; m++) {
-                    RUNTIME_data_flush( sequence, A(m, k) );
-                }
             }
         }
     }
-    RUNTIME_options_finalize(&options, chamctxt);
+    (void)chamctxt;
+}
+
+/**
+ *  Parallel tile symmetric matrix-matrix multiplication. wrapper.
+ */
+void
+chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
+                  CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                  CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                  RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+{
+    CHAM_context_t *chamctxt;
+    RUNTIME_option_t options;
+
+    chamctxt = chameleon_context_self();
+    if (sequence->status != CHAMELEON_SUCCESS) {
+        return;
+    }
+    RUNTIME_options_init( &options, chamctxt, sequence, request );
+
+    if ( ((C->p > 1) || (C->q > 1)) &&
+         (C->get_rankof == chameleon_getrankof_2d) &&
+         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
+    {
+        chameleon_pzsymm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+    }
+    else {
+        chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+    }
+
+    RUNTIME_options_finalize( &options, chamctxt );
 }
diff --git a/control/context.c b/control/context.c
index 306e42215cf7e96f0d7b8755e27b307363dcbdc0..d57b0f1e697f6c423b7146929f01092d32bf0599 100644
--- a/control/context.c
+++ b/control/context.c
@@ -62,6 +62,7 @@ CHAM_context_t *chameleon_context_create()
     chamctxt->nb                 = 128;
     chamctxt->ib                 = 32;
     chamctxt->rhblock            = 4;
+    chamctxt->lookahead          = 3;
 
     chamctxt->nworkers           = 1;
     chamctxt->ncudas             = 0;
@@ -72,11 +73,11 @@ CHAM_context_t *chameleon_context_create()
     chamctxt->parallel_enabled   = CHAMELEON_FALSE;
     chamctxt->profiling_enabled  = CHAMELEON_FALSE;
     chamctxt->progress_enabled   = CHAMELEON_FALSE;
+    chamctxt->generic_enabled    = CHAMELEON_FALSE;
 
     chamctxt->householder        = ChamFlatHouseholder;
     chamctxt->translation        = ChamOutOfPlace;
 
-
     /* Initialize scheduler */
     RUNTIME_context_create(chamctxt);
 
@@ -120,6 +121,7 @@ int chameleon_context_destroy(){
  *          @arg CHAMELEON_PROFILING_MODE  activate profiling of kernels
  *          @arg CHAMELEON_PROGRESS  activate progress indicator
  *          @arg CHAMELEON_GEMM3M  Use z/cgemm3m for complexe matrix-matrix products
+ *          @arg CHAMELEON_GENERIC  enable/disable GEMM3M  Use z/cgemm3m for complexe matrix-matrix products
  *
  *******************************************************************************
  *
@@ -160,6 +162,9 @@ int CHAMELEON_Enable(int option)
         /* case CHAMELEON_PARALLEL: */
         /*     chamctxt->parallel_enabled = CHAMELEON_TRUE; */
         /*     break; */
+        case CHAMELEON_GENERIC:
+            chamctxt->generic_enabled = CHAMELEON_TRUE;
+            break;
         default:
             chameleon_error("CHAMELEON_Enable", "illegal parameter value");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
@@ -225,6 +230,9 @@ int CHAMELEON_Disable(int option)
         case CHAMELEON_PARALLEL_MODE:
             chamctxt->parallel_enabled = CHAMELEON_FALSE;
             break;
+        case CHAMELEON_GENERIC:
+            chamctxt->generic_enabled = CHAMELEON_FALSE;
+            break;
         default:
             chameleon_error("CHAMELEON_Disable", "illegal parameter value");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
@@ -248,6 +256,7 @@ int CHAMELEON_Disable(int option)
  *          Feature to be enabled:
  *          @arg CHAMELEON_TILE_SIZE:        size matrix tile,
  *          @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block,
+ *          @arg CHAMELEON_LOOKAHEAD:        depth of the look ahead in algorithms
  *
  * @param[in] value
  *          Value of the parameter.
@@ -321,6 +330,13 @@ int CHAMELEON_Set( int param, int value )
             }
             chamctxt->translation = value;
             break;
+        case CHAMELEON_LOOKAHEAD:
+            if (value < 1) {
+                chameleon_error("CHAMELEON_Set", "illegal value of CHAMELEON_LOOKAHEAD");
+                return CHAMELEON_ERR_ILLEGAL_VALUE;
+            }
+            chamctxt->lookahead = value;
+            break;
         default:
             chameleon_error("CHAMELEON_Set", "unknown parameter");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
@@ -341,6 +357,7 @@ int CHAMELEON_Set( int param, int value )
  *          Feature to be enabled:
  *          @arg CHAMELEON_TILE_SIZE:        size matrix tile,
  *          @arg CHAMELEON_INNER_BLOCK_SIZE: size of tile inner block,
+ *          @arg CHAMELEON_LOOKAHEAD:        depth of the look ahead in algorithms
  *
  * @param[out] value
  *          Value of the parameter.
@@ -375,6 +392,9 @@ int CHAMELEON_Get(int param, int *value)
         case CHAMELEON_TRANSLATION_MODE:
             *value = chamctxt->translation;
             return CHAMELEON_SUCCESS;
+        case CHAMELEON_LOOKAHEAD:
+            *value = chamctxt->lookahead;
+            return CHAMELEON_SUCCESS;
         default:
             chameleon_error("CHAMELEON_Get", "unknown parameter");
             return CHAMELEON_ERR_ILLEGAL_VALUE;
diff --git a/control/descriptor.c b/control/descriptor.c
index de4bd65c032d661372453d319e9659a58bb1faab..ad2d53f2859da764e7156e9872f5898f985214a3 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -68,6 +68,27 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc )
     return CHAMELEON_SUCCESS;
 }
 
+/**
+ *  Internal function to return MPI rank of element A(m,n) with m,n = block indices
+ */
+int chameleon_getrankof_2d( const CHAM_desc_t *A, int m, int n )
+{
+    int mm = m + A->i / A->mb;
+    int nn = n + A->j / A->nb;
+    return (mm % A->p) * A->q + (nn % A->q);
+}
+
+/**
+ *  Internal function to return MPI rank of element DIAG(m,0) with m,n = block indices
+ */
+int chameleon_getrankof_2d_diag( const CHAM_desc_t *A, int m, int n )
+{
+    int mm = m + A->i / A->mb;
+    assert( m == n );
+    return (mm % A->p) * A->q + (mm % A->q);
+}
+
+
 /**
  ******************************************************************************
  *
diff --git a/control/descriptor.h b/control/descriptor.h
index 2be4cafe9c59f7a13042b54e6a99bf0b265bd00c..6eff677110b0e4269a5366cc7ffcf24269050547 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -44,8 +44,8 @@ inline static int   chameleon_getblkldd_ccrb(const CHAM_desc_t *A, int m);
 /**
  *  Data distributions
  */
-inline static int   chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n);
-inline static int   chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n);
+int          chameleon_getrankof_2d(const CHAM_desc_t *desc, int m, int n);
+int          chameleon_getrankof_2d_diag(const CHAM_desc_t *desc, int m, int n);
 
 int          chameleon_desc_init     ( CHAM_desc_t *desc, void *mat,
                                        cham_flttype_t dtyp, int mb, int nb, int bsiz,
@@ -175,27 +175,6 @@ inline static int chameleon_getblkldd_cm(const CHAM_desc_t *A, int m) {
     return A->llm;
 }
 
-/**
- *  Internal function to return MPI rank of element A(m,n) with m,n = block indices
- */
-inline static int chameleon_getrankof_2d(const CHAM_desc_t *A, int m, int n)
-{
-    int mm = m + A->i / A->mb;
-    int nn = n + A->j / A->nb;
-    return (mm % A->p) * A->q + (nn % A->q);
-}
-
-/**
- *  Internal function to return MPI rank of element DIAG(m,0) with m,n = block indices
- */
-inline static int chameleon_getrankof_2d_diag(const CHAM_desc_t *A, int m, int n)
-{
-    int mm = m + A->i / A->mb;
-    assert( m == n );
-    return (mm % A->p) * A->q + (mm % A->q);
-}
-
-
 /**
  * Detect if the tile is local or not
  */
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index ec7ea257144d0f1fc0730feb2f5817b98cd956f0..968a259cc3470d1f409eeea0b1b2153e06c7e89b 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -182,6 +182,7 @@ typedef enum chameleon_store_e {
 #define CHAMELEON_BOUND           7
 #define CHAMELEON_PROGRESS        8
 #define CHAMELEON_GEMM3M          9
+#define CHAMELEON_GENERIC        10
 
 /**
  *  CHAMELEON constants - configuration parameters
@@ -192,6 +193,7 @@ typedef enum chameleon_store_e {
 #define CHAMELEON_HOUSEHOLDER_MODE  5
 #define CHAMELEON_HOUSEHOLDER_SIZE  6
 #define CHAMELEON_TRANSLATION_MODE  7
+#define CHAMELEON_LOOKAHEAD         8
 
 /**
  * @brief QR/LQ factorization trees
diff --git a/include/chameleon/struct.h b/include/chameleon/struct.h
index 51315359bc8b6dad57bf577e9ea23e6bd6c6cada..cbd860c6da74bc5b5d5205506125f92d08c04720 100644
--- a/include/chameleon/struct.h
+++ b/include/chameleon/struct.h
@@ -116,6 +116,7 @@ typedef struct chameleon_context_s {
     cham_bool_t        parallel_enabled;
     cham_bool_t        profiling_enabled;
     cham_bool_t        progress_enabled;
+    cham_bool_t        generic_enabled;
 
     cham_householder_t householder;        // "domino" (flat) or tree-based (reduction) Householder
     cham_translation_t translation;        // In place or Out of place layout conversion
@@ -123,6 +124,7 @@ typedef struct chameleon_context_s {
     int                nb;
     int                ib;
     int                rhblock;            // block size for tree-based (reduction) Householder
+    int                lookahead;          // depth of the look ahead in algorithms
     void              *schedopt;           // structure for runtimes
     int                mpi_outer_init;     // MPI has been initialized outside our functions
 } CHAM_context_t;
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index 69a29fe31a1d1eba37d6e43be438b201c6e3d651..002be36104b8f3bb59ab7012bf31e35a4336716d 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -85,6 +85,8 @@ void RUNTIME_enable( void *runtime_ctxt, int lever )
     default:
         return;
     }
+
+    (void)runtime_ctxt;
     return;
 }
 
@@ -107,5 +109,7 @@ void RUNTIME_disable( void *runtime_ctxt, int lever )
     default:
         return;
     }
+
+    (void)runtime_ctxt;
     return;
 }
diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c
index acba26abf071937a0d216ca1c9f3ccde22301955..78d2394efa8d6fa4f2fd0a73ce562119faf5d7a0 100644
--- a/runtime/starpu/control/runtime_profiling.c
+++ b/runtime/starpu/control/runtime_profiling.c
@@ -56,7 +56,7 @@ void RUNTIME_iteration_pop( CHAM_context_t *chamctxt )
 
 void RUNTIME_start_profiling(){
 #if defined(HAVE_STARPU_FXT_PROFILING)
-	starpu_fxt_start_profiling();
+    starpu_fxt_start_profiling();
 #else
     fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
 #endif
@@ -64,7 +64,7 @@ void RUNTIME_start_profiling(){
 
 void RUNTIME_stop_profiling(){
 #if defined(HAVE_STARPU_FXT_PROFILING)
-	starpu_fxt_stop_profiling();
+    starpu_fxt_stop_profiling();
 #else
     fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
 #endif
diff --git a/testing/testing_dgram.c b/testing/testing_dgram.c
index c9ca38a3f7a3475e257f41d2a0c8343083efabcb..0bb2c73039c7b890fa8f674f86a3122e3ffa71ca 100644
--- a/testing/testing_dgram.c
+++ b/testing/testing_dgram.c
@@ -212,7 +212,6 @@ static int compute_gram_sequential(cham_uplo_t uplo,
                                    int LDA)
 {
     int m, n;
-    double eps;
     double squareij, mean_dij, mhalf;
 
     double *work = (double *)malloc(N * sizeof(double));
@@ -258,4 +257,4 @@ static int compute_gram_sequential(cham_uplo_t uplo,
     free(work);
 
     return 0;
-}
\ No newline at end of file
+}
diff --git a/timing/CMakeLists.txt b/timing/CMakeLists.txt
index cbe827665a7af49d0f1e77d7445b0ca350937a5a..8167545fd74905693d3d4c4d2ba9ef9d516eb952 100644
--- a/timing/CMakeLists.txt
+++ b/timing/CMakeLists.txt
@@ -103,6 +103,8 @@ set(ZSRC_LAP_INT
 set(ZSRC_TIL_INT
   # BLAS 3
   time_zgemm_tile.c
+  time_zhemm_tile.c
+  time_zsymm_tile.c
   # LAPACK
   time_zgels_tile.c
   time_zgeqrf_hqr_tile.c
diff --git a/timing/time_zhemm_tile.c b/timing/time_zhemm_tile.c
new file mode 100644
index 0000000000000000000000000000000000000000..04c16208a2745494de2e9aa2abf84e4194a16d74
--- /dev/null
+++ b/timing/time_zhemm_tile.c
@@ -0,0 +1,87 @@
+/**
+ *
+ * @file time_zhemm_tile.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @date 2014-11-16
+ * @precisions normal z -> c
+ *
+ */
+#define _TYPE  CHAMELEON_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "CHAMELEON_zhemm_Tile"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_HEMM( ChamLeft, M, N )
+#define _FADDS FADDS_HEMM( ChamLeft, M, N )
+
+#include "./timing.c"
+#include "timing_zauxiliary.h"
+
+static int
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
+{
+    CHAMELEON_Complex64_t alpha, beta;
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    LDA = chameleon_max(M, iparam[IPARAM_LDA]);
+    LDB = chameleon_max(M, iparam[IPARAM_LDB]);
+    LDC = chameleon_max(M, iparam[IPARAM_LDC]);
+
+    /* Allocate Data */
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N );
+
+    /* Initialize Data */
+    CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 );
+    CHAMELEON_zplrnt_Tile( descB, 7672 );
+    CHAMELEON_zplrnt_Tile( descC, 6387 );
+
+#if !defined(CHAMELEON_SIMULATION)
+    LAPACKE_zlarnv_work(1, ISEED, 1, &alpha);
+    LAPACKE_zlarnv_work(1, ISEED, 1, &beta);
+#else
+    alpha = 1.5;
+    beta = -2.3;
+#endif
+
+    /* Save C for check */
+    PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N );
+
+    START_TIMING();
+    CHAMELEON_zhemm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC );
+    STOP_TIMING();
+
+#if !defined(CHAMELEON_SIMULATION)
+    /* Check the solution */
+    if (check)
+    {
+        PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M );
+        PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N );
+        PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N );
+
+        dparam[IPARAM_RES] = z_check_hemm( ChamLeft, ChamUpper, M, N,
+                                           alpha, A, LDA, B, LDB, beta, C, C2, LDC,
+                                           &(dparam[IPARAM_ANORM]),
+                                           &(dparam[IPARAM_BNORM]),
+                                           &(dparam[IPARAM_XNORM]) );
+
+        free(A); free(B); free(C); free(C2);
+    }
+#endif
+
+    PASTE_CODE_FREE_MATRIX( descA );
+    PASTE_CODE_FREE_MATRIX( descB );
+    PASTE_CODE_FREE_MATRIX( descC );
+    return 0;
+}
diff --git a/timing/time_zsymm_tile.c b/timing/time_zsymm_tile.c
new file mode 100644
index 0000000000000000000000000000000000000000..0675e238c999f94e1dc902caded132415e059a6d
--- /dev/null
+++ b/timing/time_zsymm_tile.c
@@ -0,0 +1,87 @@
+/**
+ *
+ * @file time_zsymm_tile.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2019 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @version 0.9.2
+ * @author Mathieu Faverge
+ * @date 2014-11-16
+ * @precisions normal z -> c d s
+ *
+ */
+#define _TYPE  CHAMELEON_Complex64_t
+#define _PREC  double
+#define _LAMCH LAPACKE_dlamch_work
+
+#define _NAME  "CHAMELEON_zsymm_Tile"
+/* See Lawn 41 page 120 */
+#define _FMULS FMULS_SYMM( ChamLeft, M, N )
+#define _FADDS FADDS_SYMM( ChamLeft, M, N )
+
+#include "./timing.c"
+#include "timing_zauxiliary.h"
+
+static int
+RunTest(int *iparam, double *dparam, chameleon_time_t *t_)
+{
+    CHAMELEON_Complex64_t alpha, beta;
+    PASTE_CODE_IPARAM_LOCALS( iparam );
+
+    LDA = chameleon_max(M, iparam[IPARAM_LDA]);
+    LDB = chameleon_max(M, iparam[IPARAM_LDB]);
+    LDC = chameleon_max(M, iparam[IPARAM_LDC]);
+
+    /* Allocate Data */
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descA, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDA, M, M );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descB, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDB, M, N );
+    PASTE_CODE_ALLOCATE_MATRIX_TILE( descC, 1, CHAMELEON_Complex64_t, ChamComplexDouble, LDC, M, N );
+
+    /* Initialize Data */
+    CHAMELEON_zplghe_Tile( 0, ChamUpper, descA, 5373 );
+    CHAMELEON_zplrnt_Tile( descB, 7672 );
+    CHAMELEON_zplrnt_Tile( descC, 6387 );
+
+#if !defined(CHAMELEON_SIMULATION)
+    LAPACKE_zlarnv_work(1, ISEED, 1, &alpha);
+    LAPACKE_zlarnv_work(1, ISEED, 1, &beta);
+#else
+    alpha = 1.5;
+    beta = -2.3;
+#endif
+
+    /* Save C for check */
+    PASTE_TILE_TO_LAPACK( descC, C2, check, CHAMELEON_Complex64_t, LDC, N );
+
+    START_TIMING();
+    CHAMELEON_zsymm_Tile( ChamLeft, ChamUpper, alpha, descA, descB, beta, descC );
+    STOP_TIMING();
+
+#if !defined(CHAMELEON_SIMULATION)
+    /* Check the solution */
+    if (check)
+    {
+        PASTE_TILE_TO_LAPACK( descA, A, check, CHAMELEON_Complex64_t, LDA, M );
+        PASTE_TILE_TO_LAPACK( descB, B, check, CHAMELEON_Complex64_t, LDB, N );
+        PASTE_TILE_TO_LAPACK( descC, C, check, CHAMELEON_Complex64_t, LDC, N );
+
+        dparam[IPARAM_RES] = z_check_symm( ChamLeft, ChamUpper, M, N,
+                                           alpha, A, LDA, B, LDB, beta, C, C2, LDC,
+                                           &(dparam[IPARAM_ANORM]),
+                                           &(dparam[IPARAM_BNORM]),
+                                           &(dparam[IPARAM_XNORM]) );
+
+        free(A); free(B); free(C); free(C2);
+    }
+#endif
+
+    PASTE_CODE_FREE_MATRIX( descA );
+    PASTE_CODE_FREE_MATRIX( descB );
+    PASTE_CODE_FREE_MATRIX( descC );
+    return 0;
+}
diff --git a/timing/timing_zauxiliary.c b/timing/timing_zauxiliary.c
index 007d57c85ef9f7e0b50817a179b3404dcf069141..b8c995e34f2a6b18a60ecd70e2d68ba981c5f1b7 100644
--- a/timing/timing_zauxiliary.c
+++ b/timing/timing_zauxiliary.c
@@ -247,6 +247,70 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int
     return Rnorm;
 }
 
+#if defined(PRECISION_z) || defined(PRECISION_c)
+/*--------------------------------------------------------------
+ * Check the hemm
+ */
+double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
+                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                                  const CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  const CHAMELEON_Complex64_t *Ccham,
+                                                        CHAMELEON_Complex64_t *Cref, int LDC,
+                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm )
+{
+    CHAMELEON_Complex64_t beta_const = -1.0;
+    double Rnorm;
+    double *work = (double *)malloc( chameleon_max(M, N)* sizeof(double) );
+
+    *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref,  LDC, work );
+    *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work );
+
+    cblas_zhemm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N,
+                 CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC );
+
+    *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );
+
+    cblas_zaxpy( LDC * N, CBLAS_SADDR(beta_const), Ccham, 1, Cref, 1 );
+
+    Rnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );
+
+    free(work);
+
+    return Rnorm;
+}
+#endif /* defined(PRECISION_z) || defined(PRECISION_c) */
+
+/*--------------------------------------------------------------
+ * Check the symm
+ */
+double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N,
+                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                                  const CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  const CHAMELEON_Complex64_t *Ccham,
+                                                        CHAMELEON_Complex64_t *Cref, int LDC,
+                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm )
+{
+    CHAMELEON_Complex64_t beta_const = -1.0;
+    double Rnorm;
+    double *work = (double *)malloc( chameleon_max(M, N)* sizeof(double) );
+
+    *Cinitnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref,  LDC, work );
+    *Cchamnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Ccham, LDC, work );
+
+    cblas_zsymm( CblasColMajor, (CBLAS_SIDE)side, (CBLAS_UPLO)uplo, M, N,
+                 CBLAS_SADDR(alpha), A, LDA, B, LDB, CBLAS_SADDR(beta), Cref, LDC );
+
+    *Clapacknorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );
+
+    cblas_zaxpy( LDC * N, CBLAS_SADDR(beta_const), Ccham, 1, Cref, 1 );
+
+    Rnorm = LAPACKE_zlange_work( LAPACK_COL_MAJOR, 'I', M, N, Cref, LDC, work );
+
+    free(work);
+
+    return Rnorm;
+}
+
 /*--------------------------------------------------------------
  * Check the trsm
  */
diff --git a/timing/timing_zauxiliary.h b/timing/timing_zauxiliary.h
index 6fbb0ff15a89aca664486166c55272a3b50b5468..ed9c7df0bf5f4f13da61317923821c39f38607d4 100644
--- a/timing/timing_zauxiliary.h
+++ b/timing/timing_zauxiliary.h
@@ -28,6 +28,21 @@ double z_check_gemm(cham_trans_t transA, cham_trans_t transB, int M, int N, int
                    CHAMELEON_Complex64_t *Cref, int LDC,
                    double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
 
+#if defined(PRECISION_z) || defined(PRECISION_c)
+double z_check_hemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
+                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                                  const CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  const CHAMELEON_Complex64_t *Ccham,
+                                                        CHAMELEON_Complex64_t *Cref, int LDC,
+                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
+#endif
+double z_check_symm( cham_side_t side, cham_uplo_t uplo, int M, int N,
+                     CHAMELEON_Complex64_t alpha, const CHAMELEON_Complex64_t *A, int LDA,
+                                                  const CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  const CHAMELEON_Complex64_t *Ccham,
+                                                        CHAMELEON_Complex64_t *Cref, int LDC,
+                     double *Cinitnorm, double *Cchamnorm, double *Clapacknorm );
+
 double z_check_trsm(cham_side_t side, cham_uplo_t uplo, cham_trans_t trans, cham_diag_t diag,
            int M, int NRHS, CHAMELEON_Complex64_t alpha,
            CHAMELEON_Complex64_t *A, int LDA,