diff --git a/compute/pzgemm.c b/compute/pzgemm.c
index 4c41e64438be82566417209028177a56ac26950a..6d98e855dd3be370cbcbb014fdc46962b3a84180 100644
--- a/compute/pzgemm.c
+++ b/compute/pzgemm.c
@@ -24,15 +24,15 @@
  */
 #include "control/common.h"
 
-#define A(m, n) A,  m,  n
-#define B(m, n) B,  m,  n
-#define C(m, n) C,  m,  n
-#define WA(m, n) WA,  m,  n
-#define WB(m, n) WB,  m,  n
+#define A(  _m_, _n_ ) A,  (_m_), (_n_)
+#define B(  _m_, _n_ ) B,  (_m_), (_n_)
+#define C(  _m_, _n_ ) C,  (_m_), (_n_)
+#define WA( _m_, _n_ ) WA, (_m_), (_n_)
+#define WB( _m_, _n_ ) WB, (_m_), (_n_)
 
 /**
  *  Parallel tile matrix-matrix multiplication.
- *  Generic algorithm for any data distribution.
+ *  Generic algorithm for any data distribution with a stationnary A.
  */
 static inline void
 chameleon_pzgemm_Astat( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c
index b50edf517487e469b41f59d3113408d631cea0f6..4d8126f5c5150febf6cd7bc8f5d161f1a92f8010 100644
--- a/compute/pzgepdf_qdwh.c
+++ b/compute/pzgepdf_qdwh.c
@@ -818,7 +818,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
     switch( mtxtype ) {
 #if defined(PRECISION_z) || defined(PRECISION_c)
     case ChamHermitian:
-        chameleon_pzhemm( ChamRight, ChamUpper,
+        chameleon_pzhemm( gemm_ws, ChamRight, ChamUpper,
                           1., descU, &descA,
                           0., descH, sequence, request );
         if ( info ) {
@@ -827,7 +827,7 @@ chameleon_pzgepdf_qdwh( cham_mtxtype_t mtxtype, CHAM_desc_t *descU, CHAM_desc_t
         break;
 #endif
     case ChamSymmetric:
-        chameleon_pzsymm( ChamRight, ChamUpper,
+        chameleon_pzsymm( gemm_ws, ChamRight, ChamUpper,
                           1., descU, &descA,
                           0., descH, sequence, request );
         if ( info ) {
diff --git a/compute/pzhemm.c b/compute/pzhemm.c
index 75f1ab66be514b1eaf22f0f47efd8321cbcebc31..24f8bb2be7ce21c1db45d8582c0ef2d1572d3930 100644
--- a/compute/pzhemm.c
+++ b/compute/pzhemm.c
@@ -30,6 +30,257 @@
 #define WA( _m_, _n_ ) WA, (_m_), (_n_)
 #define WB( _m_, _n_ ) WB, (_m_), (_n_)
 
+/**
+ *  Parallel tile matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution with a stationnary A.
+ *
+ * Assuming A has been setup with a proper getrank function to account for symmetry
+ */
+static inline void
+chameleon_pzhemm_Astat( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
+{
+    const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    int                 k, m, n, l, Am, An;
+    int                 tempmm, tempnn, tempkn, tempkm;
+    int                 myrank = RUNTIME_comm_rank( chamctxt );
+    int                 reduceC[ C->mt * C->nt ];
+
+    /* Set C tiles to redux mode */
+    for (n = 0; n < C->nt; n++) {
+        for (m = 0; m < C->mt; m++) {
+            reduceC[ n * C->mt + m ] = 0;
+
+            /* The node owns the C tile. */
+            if ( C->get_rankof( C(m, n) ) == myrank ) {
+                reduceC[ n * C->mt + m ] = 1;
+                RUNTIME_zgersum_set_methods( C(m, n) );
+                continue;
+            }
+
+            /*
+             * The node owns the A tile that will define the locality of the
+             * computations.
+             */
+            /* Select row or column based on side */
+            l = ( side == ChamLeft ) ? m : n;
+
+            if ( uplo == ChamLower ) {
+                for (k = 0; k < A->mt; k++) {
+                    Am = k;
+                    An = k;
+
+                    if (k < l) {
+                        Am = l;
+                    }
+                    else if (k > l) {
+                        An = l;
+                    }
+
+                    if ( A->get_rankof( A( Am, An ) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+            else {
+                for (k = 0; k < A->mt; k++) {
+                    Am = k;
+                    An = k;
+
+                    if (k < l) {
+                        An = l;
+                    }
+                    else if (k > l) {
+                        Am = l;
+                    }
+
+                    if ( A->get_rankof( A( Am, An ) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    for(n = 0; n < C->nt; n++) {
+        tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+        for(m = 0; m < C->mt; m++) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+
+            /* Scale C */
+            options->forcesub = 0;
+            INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb,
+                                 beta, C, m, n );
+            options->forcesub = reduceC[ n * C->mt + m ];
+
+            /*
+             *  ChamLeft / ChamLower
+             */
+            /* Select row or column based on side */
+            l = ( side == ChamLeft ) ? m : n;
+
+            if (side == ChamLeft) {
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+
+                        if (k < m) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(m, k),  /* lda * K */
+                                       B(k, n),  /* ldb * Y */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                        else if (k == m) {
+                                INSERT_TASK_zhemm_Astat(
+                                    options,
+                                    side, uplo,
+                                    tempmm, tempnn, A->mb,
+                                    alpha, A(k, k),  /* ldak * X */
+                                           B(k, n),  /* ldb  * Y */
+                                    zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(k, m),  /* ldak * X */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                    }
+                }
+                /*
+                 *  ChamLeft / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+
+                        if (k < m) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(k, m),  /* ldak * X */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else if (k == m) {
+                            INSERT_TASK_zhemm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * K */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(m, k),  /* lda * K */
+                                       B(k, n),  /* ldb * Y */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                    }
+                }
+            }
+            /*
+             *  ChamRight / ChamLower
+             */
+            else {
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+
+                        if (k < n) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb * K */
+                                       A(n, k),  /* lda * K */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                        else if (k == n) {
+                            INSERT_TASK_zhemm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * Y */
+                                       B(m, k),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb  * K */
+                                       A(k, n),  /* ldak * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                    }
+                }
+                /*
+                 *  ChamRight / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+
+                        if (k < n) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb  * K */
+                                       A(k, n),  /* ldak * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else if (k == n) {
+                            INSERT_TASK_zhemm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * Y */
+                                       B(m, k),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb * K */
+                                       A(n, k),  /* lda * K */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                    }
+                }
+            }
+
+            RUNTIME_zgersum_submit_tree( options, C(m, n) );
+            RUNTIME_data_flush( sequence, C(m, n) );
+        }
+    }
+    options->forcesub = 0;
+    (void)chamctxt;
+}
+
+
 /**
  *  Parallel tile hermitian matrix-matrix multiplication.
  *  SUMMA algorithm for 2D block-cyclic data distribution.
@@ -310,39 +561,22 @@ static inline void
 chameleon_pzhemm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
                         CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                         CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        CHAM_desc_t *WA, CHAM_desc_t *WB,
                         RUNTIME_option_t *options )
 {
     RUNTIME_sequence_t *sequence = options->sequence;
-    CHAM_desc_t WA, WB;
-    int lookahead;
-
-    lookahead = chamctxt->lookahead;
-    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
-                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
-                         NULL, NULL, NULL );
-    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
-                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
-                         NULL, NULL, NULL );
 
     if (side == ChamLeft) {
         chameleon_pzhemm_summa_left( chamctxt, uplo, alpha, A, B, beta, C,
-                                     &WA, &WB, options );
+                                     WA, WB, options );
     }
     else {
         chameleon_pzhemm_summa_right( chamctxt, uplo, alpha, A, B, beta, C,
-                                      &WA, &WB, options );
+                                      WA, WB, options );
     }
 
-    RUNTIME_desc_flush( &WA, sequence );
-    RUNTIME_desc_flush( &WB, sequence );
-    RUNTIME_desc_flush(  C,  sequence );
-    chameleon_sequence_wait( chamctxt, sequence );
-    chameleon_desc_destroy( &WA );
-    chameleon_desc_destroy( &WB );
+    CHAMELEON_Desc_Flush( WA, sequence );
+    CHAMELEON_Desc_Flush( WB, sequence );
 }
 
 /**
@@ -530,13 +764,15 @@ chameleon_pzhemm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_
  *  Parallel tile hermitian matrix-matrix multiplication. wrapper.
  */
 void
-chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo,
+chameleon_pzhemm( struct chameleon_pzgemm_s *ws,
+                  cham_side_t side, cham_uplo_t uplo,
                   CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                   CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
                   RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
+    cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -544,15 +780,26 @@ chameleon_pzhemm( cham_side_t side, cham_uplo_t uplo,
     }
     RUNTIME_options_init( &options, chamctxt, sequence, request );
 
-    if ( ((C->p > 1) || (C->q > 1)) &&
-         (C->get_rankof == chameleon_getrankof_2d) &&
-         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
-    {
-        chameleon_pzhemm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
-    }
-    else
-    {
+    switch( alg ) {
+    case ChamGemmAlgAuto:
+    case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */
+    case ChamGemmAlgGeneric:
         chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        break;
+
+    case ChamGemmAlgSummaC:
+        chameleon_pzhemm_summa( chamctxt, side, uplo, alpha, A, B, beta, C,
+                                &(ws->WA), &(ws->WB), &options );
+        break;
+
+    case ChamGemmAlgSummaA:
+        if ( side == ChamLeft ) {
+            chameleon_pzhemm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        else {
+            chameleon_pzhemm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        break;
     }
 
     RUNTIME_options_finalize( &options, chamctxt );
diff --git a/compute/pzsymm.c b/compute/pzsymm.c
index 2f2fa1997472bc2fc56002b5bc7cf1df12d48676..e39d9f6e298a42be3b8516121cbed6c14afaee3c 100644
--- a/compute/pzsymm.c
+++ b/compute/pzsymm.c
@@ -30,6 +30,257 @@
 #define WA( _m_, _n_ ) WA, (_m_), (_n_)
 #define WB( _m_, _n_ ) WB, (_m_), (_n_)
 
+/**
+ *  Parallel tile matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution with a stationnary A.
+ *
+ * Assuming A has been setup with a proper getrank function to account for symmetry
+ */
+static inline void
+chameleon_pzsymm_Astat( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
+{
+    const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    int                 k, m, n, l, Am, An;
+    int                 tempmm, tempnn, tempkn, tempkm;
+    int                 myrank = RUNTIME_comm_rank( chamctxt );
+    int                 reduceC[ C->mt * C->nt ];
+
+    /* Set C tiles to redux mode */
+    for (n = 0; n < C->nt; n++) {
+        for (m = 0; m < C->mt; m++) {
+            reduceC[ n * C->mt + m ] = 0;
+
+            /* The node owns the C tile. */
+            if ( C->get_rankof( C(m, n) ) == myrank ) {
+                reduceC[ n * C->mt + m ] = 1;
+                RUNTIME_zgersum_set_methods( C(m, n) );
+                continue;
+            }
+
+            /*
+             * The node owns the A tile that will define the locality of the
+             * computations.
+             */
+            /* Select row or column based on side */
+            l = ( side == ChamLeft ) ? m : n;
+
+            if ( uplo == ChamLower ) {
+                for (k = 0; k < A->mt; k++) {
+                    Am = k;
+                    An = k;
+
+                    if (k < l) {
+                        Am = l;
+                    }
+                    else if (k > l) {
+                        An = l;
+                    }
+
+                    if ( A->get_rankof( A( Am, An ) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+            else {
+                for (k = 0; k < A->mt; k++) {
+                    Am = k;
+                    An = k;
+
+                    if (k < l) {
+                        An = l;
+                    }
+                    else if (k > l) {
+                        Am = l;
+                    }
+
+                    if ( A->get_rankof( A( Am, An ) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    for(n = 0; n < C->nt; n++) {
+        tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+        for(m = 0; m < C->mt; m++) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+
+            /* Scale C */
+            options->forcesub = 0;
+            INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb,
+                                 beta, C, m, n );
+            options->forcesub = reduceC[ n * C->mt + m ];
+
+            /*
+             *  ChamLeft / ChamLower
+             */
+            /* Select row or column based on side */
+            l = ( side == ChamLeft ) ? m : n;
+
+            if (side == ChamLeft) {
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+
+                        if (k < m) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(m, k),  /* lda * K */
+                                       B(k, n),  /* ldb * Y */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                        else if (k == m) {
+                                INSERT_TASK_zsymm_Astat(
+                                    options,
+                                    side, uplo,
+                                    tempmm, tempnn, A->mb,
+                                    alpha, A(k, k),  /* ldak * X */
+                                           B(k, n),  /* ldb  * Y */
+                                    zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(k, m),  /* ldak * X */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                    }
+                }
+                /*
+                 *  ChamLeft / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->mt; k++) {
+                        tempkm = k == C->mt-1 ? C->m-k*C->mb : C->mb;
+
+                        if (k < m) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(k, m),  /* ldak * X */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else if (k == m) {
+                            INSERT_TASK_zsymm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * K */
+                                       B(k, n),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkm, A->mb,
+                                alpha, A(m, k),  /* lda * K */
+                                       B(k, n),  /* ldb * Y */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                    }
+                }
+            }
+            /*
+             *  ChamRight / ChamLower
+             */
+            else {
+                if (uplo == ChamLower) {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+
+                        if (k < n) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb * K */
+                                       A(n, k),  /* lda * K */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                        else if (k == n) {
+                            INSERT_TASK_zsymm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * Y */
+                                       B(m, k),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb  * K */
+                                       A(k, n),  /* ldak * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                    }
+                }
+                /*
+                 *  ChamRight / ChamUpper
+                 */
+                else {
+                    for (k = 0; k < C->nt; k++) {
+                        tempkn = k == C->nt-1 ? C->n-k*C->nb : C->nb;
+
+                        if (k < n) {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamNoTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb  * K */
+                                       A(k, n),  /* ldak * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else if (k == n) {
+                            INSERT_TASK_zsymm_Astat(
+                                options,
+                                side, uplo,
+                                tempmm, tempnn, A->mb,
+                                alpha, A(k, k),  /* ldak * Y */
+                                       B(m, k),  /* ldb  * Y */
+                                zone,  C(m, n)); /* ldc  * Y */
+                        }
+                        else {
+                            INSERT_TASK_zgemm_Astat(
+                                options,
+                                ChamNoTrans, ChamTrans,
+                                tempmm, tempnn, tempkn, A->mb,
+                                alpha, B(m, k),  /* ldb * K */
+                                       A(n, k),  /* lda * K */
+                                zone,  C(m, n)); /* ldc * Y */
+                        }
+                    }
+                }
+            }
+
+            RUNTIME_zgersum_submit_tree( options, C(m, n) );
+            RUNTIME_data_flush( sequence, C(m, n) );
+        }
+    }
+    options->forcesub = 0;
+    (void)chamctxt;
+}
+
+
 /**
  *  Parallel tile symmetric matrix-matrix multiplication.
  *  SUMMA algorithm for 2D block-cyclic data distribution.
@@ -310,39 +561,22 @@ static inline void
 chameleon_pzsymm_summa( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_t uplo,
                         CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                         CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        CHAM_desc_t *WA, CHAM_desc_t *WB,
                         RUNTIME_option_t *options )
 {
     RUNTIME_sequence_t *sequence = options->sequence;
-    CHAM_desc_t WA, WB;
-    int lookahead;
-
-    lookahead = chamctxt->lookahead;
-    chameleon_desc_init( &WA, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
-                         C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
-                         NULL, NULL, NULL );
-    chameleon_desc_init( &WB, CHAMELEON_MAT_ALLOC_TILE,
-                         ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
-                         C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
-                         C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
-                         NULL, NULL, NULL );
 
     if (side == ChamLeft) {
         chameleon_pzsymm_summa_left( chamctxt, uplo, alpha, A, B, beta, C,
-                                     &WA, &WB, options );
+                                     WA, WB, options );
     }
     else {
         chameleon_pzsymm_summa_right( chamctxt, uplo, alpha, A, B, beta, C,
-                                      &WA, &WB, options );
+                                      WA, WB, options );
     }
 
-    RUNTIME_desc_flush( &WA, sequence );
-    RUNTIME_desc_flush( &WB, sequence );
-    RUNTIME_desc_flush(  C,  sequence );
-    chameleon_sequence_wait( chamctxt, sequence );
-    chameleon_desc_destroy( &WA );
-    chameleon_desc_destroy( &WB );
+    CHAMELEON_Desc_Flush( WA, sequence );
+    CHAMELEON_Desc_Flush( WB, sequence );
 }
 
 /**
@@ -530,13 +764,15 @@ chameleon_pzsymm_generic( CHAM_context_t *chamctxt, cham_side_t side, cham_uplo_
  *  Parallel tile symmetric matrix-matrix multiplication. wrapper.
  */
 void
-chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
+chameleon_pzsymm( struct chameleon_pzgemm_s *ws,
+                  cham_side_t side, cham_uplo_t uplo,
                   CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
                   CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
                   RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
+    cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -544,15 +780,26 @@ chameleon_pzsymm( cham_side_t side, cham_uplo_t uplo,
     }
     RUNTIME_options_init( &options, chamctxt, sequence, request );
 
-    if ( ((C->p > 1) || (C->q > 1)) &&
-         (C->get_rankof == chameleon_getrankof_2d) &&
-         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
-    {
-        chameleon_pzsymm_summa(   chamctxt, side, uplo, alpha, A, B, beta, C, &options );
-    }
-    else
-    {
+    switch( alg ) {
+    case ChamGemmAlgAuto:
+    case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */
+    case ChamGemmAlgGeneric:
         chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        break;
+
+    case ChamGemmAlgSummaC:
+        chameleon_pzsymm_summa( chamctxt, side, uplo, alpha, A, B, beta, C,
+                                &(ws->WA), &(ws->WB), &options );
+        break;
+
+    case ChamGemmAlgSummaA:
+        if ( side == ChamLeft ) {
+            chameleon_pzsymm_Astat( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        else {
+            chameleon_pzsymm_generic( chamctxt, side, uplo, alpha, A, B, beta, C, &options );
+        }
+        break;
     }
 
     RUNTIME_options_finalize( &options, chamctxt );
diff --git a/compute/zgemm.c b/compute/zgemm.c
index 325f27cd90dc3cbf8dee13f26020f613e98a501a..9ce47be609ea6784527cba041a1cefbead3409e7 100644
--- a/compute/zgemm.c
+++ b/compute/zgemm.c
@@ -90,8 +90,8 @@
  */
 void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t       transA __attribute__((unused)),
                                 cham_trans_t       transB __attribute__((unused)),
-                                const CHAM_desc_t *A      __attribute__((unused)),
-                                const CHAM_desc_t *B      __attribute__((unused)),
+                                const CHAM_desc_t *A,
+                                const CHAM_desc_t *B,
                                 const CHAM_desc_t *C )
 {
     CHAM_context_t *chamctxt;
@@ -301,7 +301,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws )
  */
 int CHAMELEON_zgemm( cham_trans_t transA, cham_trans_t transB, int M, int N, int K,
                      CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
-                     CHAMELEON_Complex64_t *B, int LDB,
+                                                  CHAMELEON_Complex64_t *B, int LDB,
                      CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
 {
     int NB;
@@ -496,7 +496,6 @@ int CHAMELEON_zgemm_Tile( cham_trans_t transA, cham_trans_t transB,
     CHAMELEON_Desc_Flush( C, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
-
     CHAMELEON_zgemm_WS_Free( ws );
 
     status = sequence->status;
diff --git a/compute/zhemm.c b/compute/zhemm.c
index 422bf9eecef6ea09acdff520e99751f5560b7e01..23c7fdf985ab84b2e604ca5dec633556ff35e68c 100644
--- a/compute/zhemm.c
+++ b/compute/zhemm.c
@@ -24,6 +24,181 @@
  */
 #include "control/common.h"
 
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Allocate the required workspaces for asynchronous hemm
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *          Specifies whether the hermitian matrix A appears on the
+ *          left or right in the operation as follows:
+ *          = ChamLeft:      \f[ C = \alpha \times A \times B + \beta \times C \f]
+ *          = ChamRight:     \f[ C = \alpha \times B \times A + \beta \times C \f]
+ *
+ * @param[in] uplo
+ *          Specifies whether the upper or lower triangular part of
+ *          the hermitian matrix A is to be referenced as follows:
+ *          = ChamLower: Only the lower triangular part of the
+ *                hermitian matrix A is to be referenced.
+ *          = ChamUpper: Only the upper triangular part of the
+ *                hermitian matrix A is to be referenced.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] B
+ *          The descriptor of the matrix B.
+ *
+ * @param[in] C
+ *          The descriptor of the matrix C.
+ *
+ *******************************************************************************
+ *
+ * @retval An allocated opaque pointer to use in CHAMELEON_zhemm_Tile_Async()
+ * and to free with CHAMELEON_zhemm_WS_Free().
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhemm_Tile_Async
+ * @sa CHAMELEON_zhemm_WS_Free
+ *
+ */
+void *CHAMELEON_zhemm_WS_Alloc( cham_side_t        side __attribute__((unused)),
+                                cham_uplo_t        uplo __attribute__((unused)),
+                                const CHAM_desc_t *A,
+                                const CHAM_desc_t *B,
+                                const CHAM_desc_t *C )
+{
+    CHAM_context_t *chamctxt;
+    struct chameleon_pzgemm_s *options;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        return NULL;
+    }
+
+    options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
+    options->alg = ChamGemmAlgAuto;
+
+    /*
+     * If only one process, or if generic has been globally enforced, we switch
+     * to generic immediately.
+     */
+    if ( ((C->p == 1) && (C->q == 1)) ||
+         (chamctxt->generic_enabled == CHAMELEON_TRUE) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Look at environment variable is something enforces the variant. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
+
+        if ( algostr ) {
+            if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaC;
+            }
+            else if ( strcasecmp( algostr, "summa_a" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else if ( strcasecmp( algostr, "summa_b" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaB;
+            }
+            else if ( strcasecmp( algostr, "generic" ) == 0  ) {
+                options->alg = ChamGemmAlgGeneric;
+            }
+            else if ( strcasecmp( algostr, "auto" ) == 0  ) {
+                options->alg = ChamGemmAlgAuto;
+            }
+            else {
+                fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
+            }
+        }
+        chameleon_cleanenv( algostr );
+    }
+
+    /* Perform automatic choice if not already enforced. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        double sizeA, sizeB, sizeC;
+        double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
+
+        /* Compute the average array per node for each matrix */
+        sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
+        sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
+        sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
+
+        if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
+            options->alg = ChamGemmAlgSummaC;
+        }
+        else {
+            if ( sizeA > sizeB ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else {
+                options->alg = ChamGemmAlgSummaB;
+            }
+        }
+    }
+
+    assert( options->alg != ChamGemmAlgAuto );
+
+    /* Now that we have decided which algorithm, let's allocate the required data structures. */
+    if ( (options->alg == ChamGemmAlgSummaC ) &&
+         (C->get_rankof == chameleon_getrankof_2d ) )
+    {
+        int lookahead = chamctxt->lookahead;
+
+        chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                             C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                             NULL, NULL, NULL );
+        chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                             C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                             NULL, NULL, NULL );
+    }
+
+    return (void*)options;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Free the allocated workspaces for asynchronous hemm
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] user_ws
+ *          On entry, the opaque pointer allocated by CHAMELEON_zhemm_WS_Alloc()
+ *          On exit, all data are freed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zhemm_Tile_Async
+ * @sa CHAMELEON_zhemm_WS_Alloc
+ *
+ */
+void CHAMELEON_zhemm_WS_Free( void *user_ws )
+{
+    struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws;
+
+    if ( ws->alg == ChamGemmAlgSummaC ) {
+        chameleon_desc_destroy( &(ws->WA) );
+        chameleon_desc_destroy( &(ws->WB) );
+    }
+    free( ws );
+}
+
 /**
  ********************************************************************************
  *
@@ -102,9 +277,9 @@
  *
  */
 int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
-                 CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
-                 CHAMELEON_Complex64_t *B, int LDB,
-                 CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
+                     CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
+                                                  CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
 {
     int NB;
     int Am;
@@ -115,6 +290,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -158,7 +334,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
         ((alpha == (CHAMELEON_Complex64_t)0.0) && beta == (CHAMELEON_Complex64_t)1.0))
         return CHAMELEON_SUCCESS;
 
-    /* Tune NB depending on M, N & NRHS; Set NBNBSIZE */
+    /* Tune NB depending on M, N & NRHS; Set NBNB */
     status = chameleon_tune(CHAMELEON_FUNC_ZHEMM, M, N, 0);
     if (status != CHAMELEON_SUCCESS) {
         chameleon_error("CHAMELEON_zhemm", "chameleon_tune() failed");
@@ -179,7 +355,8 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
                      C, NB, NB, LDC, N, M,  N, sequence, &request );
 
     /* Call the tile interface */
-    CHAMELEON_zhemm_Tile_Async(  side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request );
+    ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt );
+    CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request );
 
     /* Submit the matrix conversion back */
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
@@ -192,6 +369,7 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     chameleon_sequence_wait( chamctxt, sequence );
 
     /* Cleanup the temporary data */
+    CHAMELEON_zhemm_WS_Free( ws );
     chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
     chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt );
     chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt );
@@ -260,13 +438,14 @@ int CHAMELEON_zhemm( cham_side_t side, cham_uplo_t uplo, int M, int N,
  *
  */
 int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo,
-                      CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                      CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
     int status;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -275,13 +454,16 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo,
     }
     chameleon_sequence_create( chamctxt, &sequence );
 
-    CHAMELEON_zhemm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request );
+    ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C );
+    CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request );
 
     CHAMELEON_Desc_Flush( A, sequence );
     CHAMELEON_Desc_Flush( B, sequence );
     CHAMELEON_Desc_Flush( C, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
+    CHAMELEON_zhemm_WS_Free( ws );
+
     status = sequence->status;
     chameleon_sequence_destroy( chamctxt, sequence );
     return status;
@@ -316,11 +498,13 @@ int CHAMELEON_zhemm_Tile( cham_side_t side, cham_uplo_t uplo,
  *
  */
 int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
-                            CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                            CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                            RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+                                CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                                CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                                void *user_ws,
+                                RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
+    struct chameleon_pzgemm_s *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -391,16 +575,6 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
     }
 
     /* Check submatrix starting point */
-    /* if ( (B->i != C->i) || (B->j != C->j) ) { */
-    /*     chameleon_error("CHAMELEON_zhemm_Tile_Async", "B and C submatrices doesn't match"); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
-    /* if ( (A->i != A->j) ||  */
-    /*          ( (side == ChamLeft)  && (A->i != B->i ) ) ||  */
-    /*          ( (side == ChamRight) && (A->i != B->j ) ) ) { */
-    /*     chameleon_error("CHAMELEON_zhemm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
     if( (A->i != 0) || (A->j != 0) ||
         (B->i != 0) || (B->j != 0) ||
         (C->i != 0) || (C->j != 0) ) {
@@ -415,7 +589,21 @@ int CHAMELEON_zhemm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
         return CHAMELEON_SUCCESS;
     }
 
-    chameleon_pzhemm( side, uplo, alpha, A, B, beta, C, sequence, request );
+    if ( user_ws == NULL ) {
+        ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, A, B, C );
+    }
+    else {
+        ws = user_ws;
+    }
 
+    chameleon_pzhemm( ws, side, uplo, alpha, A, B, beta, C, sequence, request );
+
+    if ( user_ws == NULL ) {
+        CHAMELEON_Desc_Flush( A, sequence );
+        CHAMELEON_Desc_Flush( B, sequence );
+        CHAMELEON_Desc_Flush( C, sequence );
+        chameleon_sequence_wait( chamctxt, sequence );
+        CHAMELEON_zhemm_WS_Free( ws );
+    }
     return CHAMELEON_SUCCESS;
 }
diff --git a/compute/zsymm.c b/compute/zsymm.c
index c4ff5fb0ea3f8cb1bf8bb0321f2f62f52d0d33bb..397c8f65c7f4387cc4a3fd106538a2fa58ab3325 100644
--- a/compute/zsymm.c
+++ b/compute/zsymm.c
@@ -24,6 +24,181 @@
  */
 #include "control/common.h"
 
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Allocate the required workspaces for asynchronous symm
+ *
+ *******************************************************************************
+ *
+ * @param[in] side
+ *          Specifies whether the symmetric matrix A appears on the
+ *          left or right in the operation as follows:
+ *          = ChamLeft:      \f[ C = \alpha \times A \times B + \beta \times C \f]
+ *          = ChamRight:     \f[ C = \alpha \times B \times A + \beta \times C \f]
+ *
+ * @param[in] uplo
+ *          Specifies whether the upper or lower triangular part of
+ *          the symmetric matrix A is to be referenced as follows:
+ *          = ChamLower: Only the lower triangular part of the
+ *                symmetric matrix A is to be referenced.
+ *          = ChamUpper: Only the upper triangular part of the
+ *                symmetric matrix A is to be referenced.
+ *
+ * @param[in] A
+ *          The descriptor of the matrix A.
+ *
+ * @param[in] B
+ *          The descriptor of the matrix B.
+ *
+ * @param[in] C
+ *          The descriptor of the matrix C.
+ *
+ *******************************************************************************
+ *
+ * @retval An allocated opaque pointer to use in CHAMELEON_zsymm_Tile_Async()
+ * and to free with CHAMELEON_zsymm_WS_Free().
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zsymm_Tile_Async
+ * @sa CHAMELEON_zsymm_WS_Free
+ *
+ */
+void *CHAMELEON_zsymm_WS_Alloc( cham_side_t        side __attribute__((unused)),
+                                cham_uplo_t        uplo __attribute__((unused)),
+                                const CHAM_desc_t *A,
+                                const CHAM_desc_t *B,
+                                const CHAM_desc_t *C )
+{
+    CHAM_context_t *chamctxt;
+    struct chameleon_pzgemm_s *options;
+
+    chamctxt = chameleon_context_self();
+    if ( chamctxt == NULL ) {
+        return NULL;
+    }
+
+    options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
+    options->alg = ChamGemmAlgAuto;
+
+    /*
+     * If only one process, or if generic has been globally enforced, we switch
+     * to generic immediately.
+     */
+    if ( ((C->p == 1) && (C->q == 1)) ||
+         (chamctxt->generic_enabled == CHAMELEON_TRUE) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
+
+    /* Look at environment variable is something enforces the variant. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
+
+        if ( algostr ) {
+            if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaC;
+            }
+            else if ( strcasecmp( algostr, "summa_a" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else if ( strcasecmp( algostr, "summa_b" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaB;
+            }
+            else if ( strcasecmp( algostr, "generic" ) == 0  ) {
+                options->alg = ChamGemmAlgGeneric;
+            }
+            else if ( strcasecmp( algostr, "auto" ) == 0  ) {
+                options->alg = ChamGemmAlgAuto;
+            }
+            else {
+                fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
+            }
+        }
+        chameleon_cleanenv( algostr );
+    }
+
+    /* Perform automatic choice if not already enforced. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        double sizeA, sizeB, sizeC;
+        double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
+
+        /* Compute the average array per node for each matrix */
+        sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
+        sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
+        sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
+
+        if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
+            options->alg = ChamGemmAlgSummaC;
+        }
+        else {
+            if ( sizeA > sizeB ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else {
+                options->alg = ChamGemmAlgSummaB;
+            }
+        }
+    }
+
+    assert( options->alg != ChamGemmAlgAuto );
+
+    /* Now that we have decided which algorithm, let's allocate the required data structures. */
+    if ( (options->alg == ChamGemmAlgSummaC ) &&
+         (C->get_rankof == chameleon_getrankof_2d ) )
+    {
+        int lookahead = chamctxt->lookahead;
+
+        chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mt * C->mb, C->nb * C->q * lookahead, 0, 0,
+                             C->mt * C->mb, C->nb * C->q * lookahead, C->p, C->q,
+                             NULL, NULL, NULL );
+        chameleon_desc_init( &(options->WB), CHAMELEON_MAT_ALLOC_TILE,
+                             ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
+                             C->mb * C->p * lookahead, C->nt * C->nb, 0, 0,
+                             C->mb * C->p * lookahead, C->nt * C->nb, C->p, C->q,
+                             NULL, NULL, NULL );
+    }
+
+    return (void*)options;
+}
+
+/**
+ ********************************************************************************
+ *
+ * @ingroup CHAMELEON_Complex64_t
+ *
+ * @brief Free the allocated workspaces for asynchronous symm
+ *
+ *******************************************************************************
+ *
+ * @param[in,out] user_ws
+ *          On entry, the opaque pointer allocated by CHAMELEON_zsymm_WS_Alloc()
+ *          On exit, all data are freed.
+ *
+ *******************************************************************************
+ *
+ * @sa CHAMELEON_zsymm_Tile_Async
+ * @sa CHAMELEON_zsymm_WS_Alloc
+ *
+ */
+void CHAMELEON_zsymm_WS_Free( void *user_ws )
+{
+    struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws;
+
+    if ( ws->alg == ChamGemmAlgSummaC ) {
+        chameleon_desc_destroy( &(ws->WA) );
+        chameleon_desc_destroy( &(ws->WB) );
+    }
+    free( ws );
+}
+
 /**
  ********************************************************************************
  *
@@ -102,9 +277,9 @@
  *
  */
 int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
-                 CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
-                 CHAMELEON_Complex64_t *B, int LDB,
-                 CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
+                     CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t *A, int LDA,
+                                                  CHAMELEON_Complex64_t *B, int LDB,
+                     CHAMELEON_Complex64_t beta,  CHAMELEON_Complex64_t *C, int LDC )
 {
     int NB;
     int Am;
@@ -115,6 +290,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -179,7 +355,8 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
                      C, NB, NB, LDC, N, M,  N, sequence, &request );
 
     /* Call the tile interface */
-    CHAMELEON_zsymm_Tile_Async(  side, uplo, alpha, &descAt, &descBt, beta, &descCt, sequence, &request );
+    ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, &descAt, &descBt, &descCt );
+    CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, &descAt, &descBt, beta, &descCt, ws, sequence, &request );
 
     /* Submit the matrix conversion back */
     chameleon_ztile2lap( chamctxt, &descAl, &descAt,
@@ -192,6 +369,7 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
     chameleon_sequence_wait( chamctxt, sequence );
 
     /* Cleanup the temporary data */
+    CHAMELEON_zsymm_WS_Free( ws );
     chameleon_ztile2lap_cleanup( chamctxt, &descAl, &descAt );
     chameleon_ztile2lap_cleanup( chamctxt, &descBl, &descBt );
     chameleon_ztile2lap_cleanup( chamctxt, &descCl, &descCt );
@@ -260,13 +438,14 @@ int CHAMELEON_zsymm( cham_side_t side, cham_uplo_t uplo, int M, int N,
  *
  */
 int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
-                      CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                      CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
+                          CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                          CHAMELEON_Complex64_t beta,  CHAM_desc_t *C )
 {
     CHAM_context_t *chamctxt;
     RUNTIME_sequence_t *sequence = NULL;
     RUNTIME_request_t request = RUNTIME_REQUEST_INITIALIZER;
     int status;
+    void *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -275,13 +454,16 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
     }
     chameleon_sequence_create( chamctxt, &sequence );
 
-    CHAMELEON_zsymm_Tile_Async(side, uplo, alpha, A, B, beta, C, sequence, &request );
+    ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C );
+    CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, A, B, beta, C, ws, sequence, &request );
 
     CHAMELEON_Desc_Flush( A, sequence );
     CHAMELEON_Desc_Flush( B, sequence );
     CHAMELEON_Desc_Flush( C, sequence );
 
     chameleon_sequence_wait( chamctxt, sequence );
+    CHAMELEON_zsymm_WS_Free( ws );
+
     status = sequence->status;
     chameleon_sequence_destroy( chamctxt, sequence );
     return status;
@@ -316,11 +498,13 @@ int CHAMELEON_zsymm_Tile( cham_side_t side, cham_uplo_t uplo,
  *
  */
 int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
-                            CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
-                            CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
-                            RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
+                                CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                                CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                                void *user_ws,
+                                RUNTIME_sequence_t *sequence, RUNTIME_request_t *request )
 {
     CHAM_context_t *chamctxt;
+    struct chameleon_pzgemm_s *ws;
 
     chamctxt = chameleon_context_self();
     if (chamctxt == NULL) {
@@ -391,16 +575,6 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
     }
 
     /* Check submatrix starting point */
-    /* if ( (B->i != C->i) || (B->j != C->j) ) { */
-    /*     chameleon_error("CHAMELEON_zsymm_Tile_Async", "B and C submatrices doesn't match"); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
-    /* if ( (A->i != A->j) ||  */
-    /*          ( (side == ChamLeft)  && (A->i != B->i ) ) ||  */
-    /*          ( (side == ChamRight) && (A->i != B->j ) ) ) { */
-    /*     chameleon_error("CHAMELEON_zsymm_Tile_Async", "Submatrix A must start on diagnonal and match submatrices B and C."); */
-    /*     return chameleon_request_fail(sequence, request, CHAMELEON_ERR_ILLEGAL_VALUE); */
-    /* } */
     if( (A->i != 0) || (A->j != 0) ||
         (B->i != 0) || (B->j != 0) ||
         (C->i != 0) || (C->j != 0) ) {
@@ -415,7 +589,21 @@ int CHAMELEON_zsymm_Tile_Async( cham_side_t side, cham_uplo_t uplo,
         return CHAMELEON_SUCCESS;
     }
 
-    chameleon_pzsymm( side, uplo, alpha, A, B, beta, C, sequence, request );
+    if ( user_ws == NULL ) {
+        ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, A, B, C );
+    }
+    else {
+        ws = user_ws;
+    }
 
+    chameleon_pzsymm( ws, side, uplo, alpha, A, B, beta, C, sequence, request );
+
+    if ( user_ws == NULL ) {
+        CHAMELEON_Desc_Flush( A, sequence );
+        CHAMELEON_Desc_Flush( B, sequence );
+        CHAMELEON_Desc_Flush( C, sequence );
+        chameleon_sequence_wait( chamctxt, sequence );
+        CHAMELEON_zsymm_WS_Free( ws );
+    }
     return CHAMELEON_SUCCESS;
 }
diff --git a/control/chameleon_zf77.c b/control/chameleon_zf77.c
index cf1b8d94ebd98bf55ae746818056e17a9e44d075..6132c2c9ec14cf6512a9f379c8a1dc3f2f14a534 100644
--- a/control/chameleon_zf77.c
+++ b/control/chameleon_zf77.c
@@ -757,7 +757,7 @@ void CHAMELEON_ZGETRS_NOPIV_TILE_ASYNC(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_s
 
 #if defined(PRECISION_z) || defined(PRECISION_c)
 void CHAMELEON_ZHEMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info)
-{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); }
+{ *info = CHAMELEON_zhemm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); }
 #endif
 
 #if defined(PRECISION_z) || defined(PRECISION_c)
@@ -840,7 +840,7 @@ void CHAMELEON_ZSYTRS_TILE_ASYNC(cham_uplo_t *uplo, CHAM_desc_t *A, CHAM_desc_t
 #endif
 
 void CHAMELEON_ZSYMM_TILE_ASYNC(cham_side_t *side, cham_uplo_t *uplo, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info)
-{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, sequence, request); }
+{ *info = CHAMELEON_zsymm_Tile_Async(*side, *uplo, *alpha, A, B, *beta, C, NULL, sequence, request); }
 
 void CHAMELEON_ZSYR2K_TILE_ASYNC(cham_uplo_t *uplo, cham_trans_t *trans, CHAMELEON_Complex64_t *alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t *beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request, int *info)
 { *info = CHAMELEON_zsyr2k_Tile_Async(*uplo, *trans, *alpha, A, B, *beta, C, sequence, request); }
diff --git a/control/compute_z.h b/control/compute_z.h
index 760127d31d5eab6d4339316c2888f36e6a535666..1ee61f62894310b1f41c442d47422c7729745e10 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -81,7 +81,7 @@ void chameleon_pzgetrf_nopiv(CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTI
 void chameleon_pzgetrf_reclap(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzgetrf_rectil(CHAM_desc_t *A, int *IPIV, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzhegst(int itype, cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-void chameleon_pzhemm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+void chameleon_pzhemm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzherk(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzher2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzhetrd_he2hb(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *T, CHAM_desc_t *E, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
@@ -106,7 +106,7 @@ void chameleon_pzplrnk(int K, CHAM_desc_t *C, unsigned long long int seedA, unsi
 void chameleon_pzpotrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzpotrimm(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzshift(int, int, int, CHAMELEON_Complex64_t *, int *, int, int, int, RUNTIME_sequence_t*, RUNTIME_request_t*);
-void chameleon_pzsymm(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+void chameleon_pzsymm( struct chameleon_pzgemm_s *ws,cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
 void chameleon_pzsyrk(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta,  CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzsyr2k(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 void chameleon_pzsytrf(cham_uplo_t uplo, CHAM_desc_t *A, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
diff --git a/include/chameleon/chameleon_z.h b/include/chameleon/chameleon_z.h
index 3a800ec85f90f186e2a802698d533b0adc073cbb..6e576d21b1e231fee7d66277cfc72311c14d093b 100644
--- a/include/chameleon/chameleon_z.h
+++ b/include/chameleon/chameleon_z.h
@@ -213,7 +213,7 @@ int CHAMELEON_zgetrf_nopiv_Tile_Async(CHAM_desc_t *A, RUNTIME_sequence_t *sequen
 //int CHAMELEON_zgetrs_Tile_Async(cham_trans_t trans, CHAM_desc_t *A, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zgetrs_incpiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *L, int *IPIV, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zgetrs_nopiv_Tile_Async(CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+int CHAMELEON_zhemm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zherk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, double alpha, CHAM_desc_t *A, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zher2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, double beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 //int CHAMELEON_zheev_Tile_Async(cham_job_t jobz, cham_uplo_t uplo, CHAM_desc_t *A, double *W, CHAM_desc_t *T, CHAMELEON_Complex64_t *Q, int LDQ, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
@@ -247,7 +247,7 @@ int CHAMELEON_zpotrimm_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t
 int CHAMELEON_zpotrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsysv_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsytrs_Tile_Async(cham_uplo_t uplo, CHAM_desc_t *A, CHAM_desc_t *B, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
-int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
+int CHAMELEON_zsymm_Tile_Async(cham_side_t side, cham_uplo_t uplo, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, void *ws, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsyrk_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_zsyr2k_Tile_Async(cham_uplo_t uplo, cham_trans_t trans, CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B, CHAMELEON_Complex64_t beta, CHAM_desc_t *C, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request);
 int CHAMELEON_ztpgqrt_Tile_Async( int L, CHAM_desc_t *V1, CHAM_desc_t *T1, CHAM_desc_t *V2, CHAM_desc_t *T2, CHAM_desc_t *Q1, CHAM_desc_t *Q2, RUNTIME_sequence_t *sequence, RUNTIME_request_t *request );
@@ -312,6 +312,10 @@ int CHAMELEON_zunmqr_param_Tile_Async(const libhqr_tree_t *qrtree, cham_side_t s
  */
 void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t transA, cham_trans_t transB, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C );
 void  CHAMELEON_zgemm_WS_Free( void *ws );
+void *CHAMELEON_zhemm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C );
+void  CHAMELEON_zhemm_WS_Free( void *ws );
+void *CHAMELEON_zsymm_WS_Alloc( cham_side_t side, cham_uplo_t uplo, const CHAM_desc_t *A, const CHAM_desc_t *B, const CHAM_desc_t *C );
+void  CHAMELEON_zsymm_WS_Free( void *ws );
 void *CHAMELEON_zcesca_WS_Alloc( const CHAM_desc_t *A );
 void  CHAMELEON_zcesca_WS_Free( void *ws );
 void *CHAMELEON_zgram_WS_Alloc( const CHAM_desc_t *A );
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index ae307f760eb78d5ec4b3fec9a41db694211382be..4576ba9e44671fa4964b988d9f9a73e6bb9df312 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -110,6 +110,12 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
+void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                              const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
 void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
                          int n, int k, int nb,
@@ -210,6 +216,12 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
+void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                              const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
 void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
                          int n, int k, int nb,
diff --git a/runtime/openmp/codelets/codelet_zhemm.c b/runtime/openmp/codelets/codelet_zhemm.c
index 79d36ff56f9c76074225a9b0438631236eab3b39..2d36c052c02632b2363f2ba6078e46839e4b84ea 100644
--- a/runtime/openmp/codelets/codelet_zhemm.c
+++ b/runtime/openmp/codelets/codelet_zhemm.c
@@ -20,23 +20,37 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_ztile.h"
 
-void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
     CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
     CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn );
 #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] )
     TCORE_zhemm( side, uplo,
-        m, n,
-        alpha, tileA,
-        tileB,
-        beta, tileC );
+                 m, n,
+                 alpha, tileA,
+                 tileB,
+                 beta, tileC );
 
     (void)options;
     (void)nb;
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/openmp/codelets/codelet_zsymm.c b/runtime/openmp/codelets/codelet_zsymm.c
index 90ea5d8490c741a38ff56536c32f25e37616f0c3..b89c945fb3cf882417048869873e71d33a21c3c6 100644
--- a/runtime/openmp/codelets/codelet_zsymm.c
+++ b/runtime/openmp/codelets/codelet_zsymm.c
@@ -20,23 +20,37 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_ztile.h"
 
-void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
+void
+INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
     CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
     CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn );
 #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] )
     TCORE_zsymm( side, uplo,
-        m, n,
-        alpha, tileA,
-        tileB,
-        beta, tileC );
+                 m, n,
+                 alpha, tileA,
+                 tileB,
+                 beta, tileC );
 
     (void)options;
     (void)nb;
 }
+
+void
+INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/parsec/codelets/codelet_zgemm.c b/runtime/parsec/codelets/codelet_zgemm.c
index 5a0938ed97e3dc116c18007b55897ddad0253c75..b831ebd09b00ceb5257e09fba345b851252edf26 100644
--- a/runtime/parsec/codelets/codelet_zgemm.c
+++ b/runtime/parsec/codelets/codelet_zgemm.c
@@ -23,11 +23,6 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_z.h"
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
 static inline int
 CORE_zgemm_parsec( parsec_execution_stream_t *context,
                    parsec_task_t             *this_task )
@@ -50,20 +45,21 @@ CORE_zgemm_parsec( parsec_execution_stream_t *context,
         this_task, &transA, &transB, &m, &n, &k, &alpha, &A, &lda, &B, &ldb, &beta, &C, &ldc );
 
     CORE_zgemm( transA, transB, m, n, k,
-               alpha, A, lda,
-                      B, ldb,
-               beta,  C, ldc);
+                alpha, A, lda,
+                       B, ldb,
+                beta,  C, ldc );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
-                       cham_trans_t transA, cham_trans_t transB,
-                       int m, int n, int k, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+void
+INSERT_TASK_zgemm( const RUNTIME_option_t *options,
+                   cham_trans_t transA, cham_trans_t transB,
+                   int m, int n, int k, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                                                 const CHAM_desc_t *B, int Bm, int Bn,
-                       CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
@@ -71,7 +67,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn );
 
     parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_zgemm_parsec, options->priority, "Gemm",
+        PARSEC_dtd_taskpool, CORE_zgemm_parsec, options->priority, "gemm",
         sizeof(cham_trans_t),    &transA,                           VALUE,
         sizeof(cham_trans_t),    &transB,                           VALUE,
         sizeof(int),           &m,                                VALUE,
@@ -95,7 +91,7 @@ INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                          cham_trans_t transA, cham_trans_t transB,
                          int m, int n, int k, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                         const CHAM_desc_t *B, int Bm, int Bn,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
diff --git a/runtime/parsec/codelets/codelet_zhemm.c b/runtime/parsec/codelets/codelet_zhemm.c
index a1c398308c514eaaea41be192c061bc42b09ac09..62ecccc26161921e93e832c6773a83284f6d76dc 100644
--- a/runtime/parsec/codelets/codelet_zhemm.c
+++ b/runtime/parsec/codelets/codelet_zhemm.c
@@ -43,20 +43,21 @@ CORE_zhemm_parsec( parsec_execution_stream_t *context,
         this_task, &side, &uplo, &M, &N, &alpha, &A, &LDA, &B, &LDB, &beta, &C, &LDC );
 
     CORE_zhemm( side, uplo, M, N,
-               alpha, A, LDA,
-                      B, LDB,
-               beta,  C, LDC);
+                alpha, A, LDA,
+                       B, LDB,
+                beta,  C, LDC );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
@@ -81,3 +82,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/parsec/codelets/codelet_zsymm.c b/runtime/parsec/codelets/codelet_zsymm.c
index cf8fc50cd9c3a2f3718e3be355a5ee731e65a656..d2868d4c11b2d3c9ee482ea4f1470616ab1cf908 100644
--- a/runtime/parsec/codelets/codelet_zsymm.c
+++ b/runtime/parsec/codelets/codelet_zsymm.c
@@ -24,7 +24,7 @@
 
 static inline int
 CORE_zsymm_parsec( parsec_execution_stream_t *context,
-                    parsec_task_t             *this_task )
+                   parsec_task_t             *this_task )
 {
     cham_side_t side;
     cham_uplo_t uplo;
@@ -45,18 +45,19 @@ CORE_zsymm_parsec( parsec_execution_stream_t *context,
     CORE_zsymm( side, uplo, M, N,
                 alpha, A, LDA,
                        B, LDB,
-                beta,  C, LDC);
+                beta,  C, LDC );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
@@ -81,3 +82,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+void
+INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c
index d854c638a2f01e68018ca0973e67ce8a4d0604a6..1eb63bb7bfd66ec2a667357a1b4efcd7b2550da7 100644
--- a/runtime/quark/codelets/codelet_zgemm.c
+++ b/runtime/quark/codelets/codelet_zgemm.c
@@ -46,12 +46,13 @@ void CORE_zgemm_quark(Quark *quark)
                  beta,  tileC );
 }
 
-void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
-                      cham_trans_t transA, cham_trans_t transB,
-                      int m, int n, int k, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                                                   const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zgemm( const RUNTIME_option_t *options,
+                   cham_trans_t transA, cham_trans_t transB,
+                   int m, int n, int k, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
@@ -81,7 +82,7 @@ INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                          cham_trans_t transA, cham_trans_t transB,
                          int m, int n, int k, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                         const CHAM_desc_t *B, int Bm, int Bn,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c
index 9ab119755135f795ebc39031bd08795f5a59b07b..917376cedfdea194ae2c789808e78a92dad1f610 100644
--- a/runtime/quark/codelets/codelet_zhemm.c
+++ b/runtime/quark/codelets/codelet_zhemm.c
@@ -30,28 +30,28 @@ void CORE_zhemm_quark(Quark *quark)
 {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
 
-    quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC);
-    TCORE_zhemm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC);
+    TCORE_zhemm( side, uplo,
+                 m, n,
+                 alpha, tileA, tileB,
+                 beta, tileC );
 }
 
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
@@ -74,3 +74,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               accessC,
         0);
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/quark/codelets/codelet_zsymm.c b/runtime/quark/codelets/codelet_zsymm.c
index 6a57a49f9429384c38b6c03ce93984f503a3c744..a1cf3d43150d6faab68001cfdaffeba009120f40 100644
--- a/runtime/quark/codelets/codelet_zsymm.c
+++ b/runtime/quark/codelets/codelet_zsymm.c
@@ -30,28 +30,28 @@ void CORE_zsymm_quark(Quark *quark)
 {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
 
-    quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC);
-    TCORE_zsymm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC);
+    TCORE_zsymm( side, uplo,
+                 m, n,
+                 alpha, tileA, tileB,
+                 beta, tileC );
 }
 
-void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
@@ -74,3 +74,16 @@ void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               accessC,
         0);
 }
+
+void
+INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index bf72c91c2cd697268438ac9641f977be4b77cd51..9e1574d7abd7fe95b0c47904aa44f83744f0e9a1 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -87,8 +87,6 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg )
         (cuDoubleComplex*)&(clargs->beta),
         tileC->mat, tileC->ld,
         handle );
-
-    return;
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -182,10 +180,7 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-
         0 );
-
-    (void)nb;
 }
 
 void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
@@ -260,8 +255,5 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
-
         0 );
-
-    (void)nb;
 }
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index c05dbb92d53962d337b995f3725f02aeae19e31f..4451431f4c075feca2d6d6f6170098be3cf1d3cc 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -26,111 +26,229 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
-{
+struct cl_zhemm_args_s {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
+};
+
+#if !defined(CHAMELEON_SIMULATION)
+static void
+cl_zhemm_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
-    TCORE_zhemm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    TCORE_zhemm( clargs->side, clargs->uplo,
+                 clargs->m, clargs->n,
+                 clargs->alpha, tileA, tileB,
+                 clargs->beta,  tileC );
 }
 
 #ifdef CHAMELEON_USE_CUDA
-static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
+static void
+cl_zhemm_cuda_func( void *descr[], void *cl_arg )
 {
-    cublasHandle_t handle = starpu_cublas_get_local_handle();
-    cham_side_t side;
-    cham_uplo_t uplo;
-    int M;
-    int N;
-    cuDoubleComplex alpha;
+    struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    cuDoubleComplex beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
 
     CUDA_zhemm(
-        side, uplo,
-        M, N,
-        &alpha, tileA->mat, tileA->ld,
-                tileB->mat, tileB->ld,
-        &beta,  tileC->mat, tileC->ld,
+        clargs->side, clargs->uplo,
+        clargs->m, clargs->n,
+        (cuDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (cuDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
         handle );
 }
-#endif /* CHAMELEON_USE_CUDA */
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS(zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC)
+CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC )
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                                    beta, C, Cm, Cn );
+    }
+
+    struct cl_zhemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec    = 0;
+    char                    *cl_name = "zhemm_Astat";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+     /* Check A as write, since it will be the owner of the computation */
+    CHAMELEON_ACCESS_W(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+     /* Check C as read, since it will be used in a reduction */
+    CHAMELEON_ACCESS_R(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zhemm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zhemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    if ( beta == 0. ) {
+        accessC = STARPU_W;
+    }
+#if defined(HAVE_STARPU_MPI_REDUX)
+    else if ( beta == 1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+    else {
+        accessC = STARPU_RW;
+    }
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_zhemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+        0 );
+}
+
+void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                                     beta, C, Cm, Cn );
     }
 
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zhemm;
-    void (*callback)(void*) = options->profiling ? cl_zhemm_callback : NULL;
-    int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    struct cl_zhemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec = 0;
+    char                    *cl_name = "zhemm";
 
+    /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zhemm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zhemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &side,                sizeof(int),
-        STARPU_VALUE,    &uplo,                sizeof(int),
-        STARPU_VALUE,       &m,                        sizeof(int),
-        STARPU_VALUE,       &n,                        sizeof(int),
-        STARPU_VALUE,   &alpha,         sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,               RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_R,               RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
-        STARPU_VALUE,    &beta,         sizeof(CHAMELEON_Complex64_t),
-        accessC,                RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
-        STARPU_PRIORITY,    options->priority,
-        STARPU_CALLBACK,    callback,
+        &cl_zhemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zhemm",
+        STARPU_NAME,              cl_name,
 #endif
-        0);
+        0 );
 }
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index e1507f65791c4fe99dfab487a5f0ae2a2a05e96b..62b8de470b7c36f0cb82ecf826c0444707289d71 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -26,111 +26,229 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zsymm_cpu_func(void *descr[], void *cl_arg)
-{
+struct cl_zsymm_args_s {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
+};
+
+#if !defined(CHAMELEON_SIMULATION)
+static void
+cl_zsymm_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
-    TCORE_zsymm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    TCORE_zsymm( clargs->side, clargs->uplo,
+                 clargs->m, clargs->n,
+                 clargs->alpha, tileA, tileB,
+                 clargs->beta,  tileC );
 }
 
 #ifdef CHAMELEON_USE_CUDA
-static void cl_zsymm_cuda_func(void *descr[], void *cl_arg)
+static void
+cl_zsymm_cuda_func( void *descr[], void *cl_arg )
 {
-    cublasHandle_t handle = starpu_cublas_get_local_handle();
-    cham_side_t side;
-    cham_uplo_t uplo;
-    int M;
-    int N;
-    cuDoubleComplex alpha;
+    struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    cuDoubleComplex beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
 
     CUDA_zsymm(
-        side, uplo,
-        M, N,
-        &alpha, tileA->mat, tileA->ld,
-                tileB->mat, tileB->ld,
-        &beta,  tileC->mat, tileC->ld,
+        clargs->side, clargs->uplo,
+        clargs->m, clargs->n,
+        (cuDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (cuDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
         handle );
 }
-#endif /* CHAMELEON_USE_CUDA */
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS(zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC)
+CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC )
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
-void INSERT_TASK_zsymm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                                    beta, C, Cm, Cn );
+    }
+
+    struct cl_zsymm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec    = 0;
+    char                    *cl_name = "zsymm_Astat";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+     /* Check A as write, since it will be the owner of the computation */
+    CHAMELEON_ACCESS_W(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+     /* Check C as read, since it will be used in a reduction */
+    CHAMELEON_ACCESS_R(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zsymm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zsymm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    if ( beta == 0. ) {
+        accessC = STARPU_W;
+    }
+#if defined(HAVE_STARPU_MPI_REDUX)
+    else if ( beta == 1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+    else {
+        accessC = STARPU_RW;
+    }
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_zsymm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+        0 );
+}
+
+void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                                     beta, C, Cm, Cn );
     }
 
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zsymm;
-    void (*callback)(void*) = options->profiling ? cl_zsymm_callback : NULL;
-    int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    struct cl_zsymm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec = 0;
+    char                    *cl_name = "zsymm";
 
+    /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zsymm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zsymm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &side,                sizeof(int),
-        STARPU_VALUE,    &uplo,                sizeof(int),
-        STARPU_VALUE,       &m,                        sizeof(int),
-        STARPU_VALUE,       &n,                        sizeof(int),
-        STARPU_VALUE,   &alpha,         sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,               RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_R,               RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
-        STARPU_VALUE,    &beta,         sizeof(CHAMELEON_Complex64_t),
-        accessC,                RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
-        STARPU_PRIORITY,    options->priority,
-        STARPU_CALLBACK,    callback,
+        &cl_zsymm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zsymm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zsymm",
+        STARPU_NAME,              cl_name,
 #endif
-        0);
+        0 );
 }
diff --git a/testing/testing_zhemm.c b/testing/testing_zhemm.c
index 38d91da96d7d96794e665a0551d196bd055e8779..e80471c8fd687bb301f5b703442f17a6ce9e2366 100644
--- a/testing/testing_zhemm.c
+++ b/testing/testing_zhemm.c
@@ -57,6 +57,7 @@ testing_zhemm_desc( run_arg_list_t *args, int check )
     /* Descriptors */
     int          Am;
     CHAM_desc_t *descA, *descB, *descC, *descCinit;
+    void        *ws = NULL;
 
     bump  = run_arg_get_double( args, "bump", bump );
     alpha = run_arg_get_complex64( args, "alpha", alpha );
@@ -85,11 +86,15 @@ testing_zhemm_desc( run_arg_list_t *args, int check )
     CHAMELEON_zplrnt_Tile( descB, seedB );
     CHAMELEON_zplrnt_Tile( descC, seedC );
 
+    if ( async ) {
+        ws = CHAMELEON_zhemm_WS_Alloc( side, uplo, descA, descB, descC );
+    }
+
     /* Calculates the product */
     testing_start( &test_data );
     if ( async ) {
         hres = CHAMELEON_zhemm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC,
-                                           test_data.sequence, &test_data.request );
+                                           ws, test_data.sequence, &test_data.request );
         CHAMELEON_Desc_Flush( descA, test_data.sequence );
         CHAMELEON_Desc_Flush( descB, test_data.sequence );
         CHAMELEON_Desc_Flush( descC, test_data.sequence );
@@ -100,6 +105,10 @@ testing_zhemm_desc( run_arg_list_t *args, int check )
     test_data.hres = hres;
     testing_stop( &test_data, flops_zhemm( side, M, N ) );
 
+    if ( ws != NULL ) {
+        CHAMELEON_zhemm_WS_Free( ws );
+    }
+
     /* Checks the solution */
     if ( check ) {
         CHAMELEON_Desc_Create(
diff --git a/testing/testing_zsymm.c b/testing/testing_zsymm.c
index b4edb1e840c4d095e6832eba9ea7759d9810c0db..39f9663010ec11b9cb4f608eb99ae0366dbce624 100644
--- a/testing/testing_zsymm.c
+++ b/testing/testing_zsymm.c
@@ -57,6 +57,7 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     /* Descriptors */
     int          Am;
     CHAM_desc_t *descA, *descB, *descC, *descCinit;
+    void        *ws = NULL;
 
     bump  = run_arg_get_double( args, "bump", bump );
     alpha = run_arg_get_complex64( args, "alpha", alpha );
@@ -85,11 +86,15 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     CHAMELEON_zplrnt_Tile( descB, seedB );
     CHAMELEON_zplrnt_Tile( descC, seedC );
 
+    if ( async ) {
+        ws = CHAMELEON_zsymm_WS_Alloc( side, uplo, descA, descB, descC );
+    }
+
     /* Calculates the product */
     testing_start( &test_data );
     if ( async ) {
         hres = CHAMELEON_zsymm_Tile_Async( side, uplo, alpha, descA, descB, beta, descC,
-                                           test_data.sequence, &test_data.request );
+                                           ws, test_data.sequence, &test_data.request );
         CHAMELEON_Desc_Flush( descA, test_data.sequence );
         CHAMELEON_Desc_Flush( descB, test_data.sequence );
         CHAMELEON_Desc_Flush( descC, test_data.sequence );
@@ -100,6 +105,10 @@ testing_zsymm_desc( run_arg_list_t *args, int check )
     test_data.hres = hres;
     testing_stop( &test_data, flops_zsymm( side, M, N ) );
 
+    if ( ws != NULL ) {
+        CHAMELEON_zsymm_WS_Free( ws );
+    }
+
     /* Checks the solution */
     if ( check ) {
         CHAMELEON_Desc_Create(