diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 0403d1c8d6b898ba41eb6c57bb423b3dc65c28fa..4576ba9e44671fa4964b988d9f9a73e6bb9df312 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -110,6 +110,12 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
+void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                              const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
 void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
                          int n, int k, int nb,
diff --git a/runtime/openmp/codelets/codelet_zhemm.c b/runtime/openmp/codelets/codelet_zhemm.c
index 79d36ff56f9c76074225a9b0438631236eab3b39..2d36c052c02632b2363f2ba6078e46839e4b84ea 100644
--- a/runtime/openmp/codelets/codelet_zhemm.c
+++ b/runtime/openmp/codelets/codelet_zhemm.c
@@ -20,23 +20,37 @@
 #include "chameleon/tasks_z.h"
 #include "coreblas/coreblas_ztile.h"
 
-void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn )
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
     CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
     CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn );
 #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] )
     TCORE_zhemm( side, uplo,
-        m, n,
-        alpha, tileA,
-        tileB,
-        beta, tileC );
+                 m, n,
+                 alpha, tileA,
+                 tileB,
+                 beta, tileC );
 
     (void)options;
     (void)nb;
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/parsec/codelets/codelet_zhemm.c b/runtime/parsec/codelets/codelet_zhemm.c
index a1c398308c514eaaea41be192c061bc42b09ac09..62ecccc26161921e93e832c6773a83284f6d76dc 100644
--- a/runtime/parsec/codelets/codelet_zhemm.c
+++ b/runtime/parsec/codelets/codelet_zhemm.c
@@ -43,20 +43,21 @@ CORE_zhemm_parsec( parsec_execution_stream_t *context,
         this_task, &side, &uplo, &M, &N, &alpha, &A, &LDA, &B, &LDB, &beta, &C, &LDC );
 
     CORE_zhemm( side, uplo, M, N,
-               alpha, A, LDA,
-                      B, LDB,
-               beta,  C, LDC);
+                alpha, A, LDA,
+                       B, LDB,
+                beta,  C, LDC );
 
     (void)context;
     return PARSEC_HOOK_RETURN_DONE;
 }
 
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
     CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
@@ -81,3 +82,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c
index 9ab119755135f795ebc39031bd08795f5a59b07b..917376cedfdea194ae2c789808e78a92dad1f610 100644
--- a/runtime/quark/codelets/codelet_zhemm.c
+++ b/runtime/quark/codelets/codelet_zhemm.c
@@ -30,28 +30,28 @@ void CORE_zhemm_quark(Quark *quark)
 {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
 
-    quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC);
-    TCORE_zhemm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC);
+    TCORE_zhemm( side, uplo,
+                 m, n,
+                 alpha, tileA, tileB,
+                 beta, tileC );
 }
 
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void
+INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                   cham_side_t side, cham_uplo_t uplo,
+                   int m, int n, int nb,
+                   CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                const CHAM_desc_t *B, int Bm, int Bn,
+                   CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
@@ -74,3 +74,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
         sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),               accessC,
         0);
 }
+
+void
+INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                         cham_side_t side, cham_uplo_t uplo,
+                         int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index c05dbb92d53962d337b995f3725f02aeae19e31f..4451431f4c075feca2d6d6f6170098be3cf1d3cc 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -26,111 +26,229 @@
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-#if !defined(CHAMELEON_SIMULATION)
-static void cl_zhemm_cpu_func(void *descr[], void *cl_arg)
-{
+struct cl_zhemm_args_s {
     cham_side_t side;
     cham_uplo_t uplo;
-    int M;
-    int N;
+    int m;
+    int n;
     CHAMELEON_Complex64_t alpha;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
+};
+
+#if !defined(CHAMELEON_SIMULATION)
+static void
+cl_zhemm_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+    CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
-    TCORE_zhemm(side, uplo,
-        M, N,
-        alpha, tileA,
-        tileB,
-        beta, tileC);
+    TCORE_zhemm( clargs->side, clargs->uplo,
+                 clargs->m, clargs->n,
+                 clargs->alpha, tileA, tileB,
+                 clargs->beta,  tileC );
 }
 
 #ifdef CHAMELEON_USE_CUDA
-static void cl_zhemm_cuda_func(void *descr[], void *cl_arg)
+static void
+cl_zhemm_cuda_func( void *descr[], void *cl_arg )
 {
-    cublasHandle_t handle = starpu_cublas_get_local_handle();
-    cham_side_t side;
-    cham_uplo_t uplo;
-    int M;
-    int N;
-    cuDoubleComplex alpha;
+    struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg;
+    cublasHandle_t          handle = starpu_cublas_get_local_handle();
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    cuDoubleComplex beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta);
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
 
     CUDA_zhemm(
-        side, uplo,
-        M, N,
-        &alpha, tileA->mat, tileA->ld,
-                tileB->mat, tileB->ld,
-        &beta,  tileC->mat, tileC->ld,
+        clargs->side, clargs->uplo,
+        clargs->m, clargs->n,
+        (cuDoubleComplex*)&(clargs->alpha),
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        (cuDoubleComplex*)&(clargs->beta),
+        tileC->mat, tileC->ld,
         handle );
 }
-#endif /* CHAMELEON_USE_CUDA */
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS(zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC)
+CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC )
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
-void INSERT_TASK_zhemm(const RUNTIME_option_t *options,
-                      cham_side_t side, cham_uplo_t uplo,
-                      int m, int n, int nb,
-                      CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                      const CHAM_desc_t *B, int Bm, int Bn,
-                      CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                                    beta, C, Cm, Cn );
+    }
+
+    struct cl_zhemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec    = 0;
+    char                    *cl_name = "zhemm_Astat";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+     /* Check A as write, since it will be the owner of the computation */
+    CHAMELEON_ACCESS_W(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+     /* Check C as read, since it will be used in a reduction */
+    CHAMELEON_ACCESS_R(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zhemm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zhemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    if ( beta == 0. ) {
+        accessC = STARPU_W;
+    }
+#if defined(HAVE_STARPU_MPI_REDUX)
+    else if ( beta == 1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+    else {
+        accessC = STARPU_RW;
+    }
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_zhemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+        0 );
+}
+
+void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
                                     beta, C, Cm, Cn );
     }
 
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zhemm;
-    void (*callback)(void*) = options->profiling ? cl_zhemm_callback : NULL;
-    int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    struct cl_zhemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec = 0;
+    char                    *cl_name = "zhemm";
 
+    /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zhemm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zhemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0));
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &side,                sizeof(int),
-        STARPU_VALUE,    &uplo,                sizeof(int),
-        STARPU_VALUE,       &m,                        sizeof(int),
-        STARPU_VALUE,       &n,                        sizeof(int),
-        STARPU_VALUE,   &alpha,         sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,               RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_R,               RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
-        STARPU_VALUE,    &beta,         sizeof(CHAMELEON_Complex64_t),
-        accessC,                RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
-        STARPU_PRIORITY,    options->priority,
-        STARPU_CALLBACK,    callback,
+        &cl_zhemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, "zhemm",
+        STARPU_NAME,              cl_name,
 #endif
-        0);
+        0 );
 }