diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index 34a5bc904d05debd04053b4452611b87df3d94be..0f404e1a52472500e2ada61ac251bff26bbb7c6d 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -41,6 +41,7 @@ _extra_blas = [
     ('',                     'sgepdf',               'dgepdf',               'cgepdf',               'zgepdf'              ),
     ('',                     'scesca',               'dcesca',               'ccesca',               'zcesca'              ),
     ('',                     'sgesum',               'dgesum',               'cgesum',               'zgesum'              ),
+    ('',                     'sgersum',              'dgersum',              'cgersum',              'zgersum'             ),
 ]
 
 _extra_BLAS = [ [ x.upper() for x in row ] for row in _extra_blas ]
diff --git a/compute/pzgemm.c b/compute/pzgemm.c
index 66a79bcde0a26f21c9e18f5827695a7b36e4fbb6..4c41e64438be82566417209028177a56ac26950a 100644
--- a/compute/pzgemm.c
+++ b/compute/pzgemm.c
@@ -30,6 +30,148 @@
 #define WA(m, n) WA,  m,  n
 #define WB(m, n) WB,  m,  n
 
+/**
+ *  Parallel tile matrix-matrix multiplication.
+ *  Generic algorithm for any data distribution.
+ */
+static inline void
+chameleon_pzgemm_Astat( CHAM_context_t *chamctxt, cham_trans_t transA, cham_trans_t transB,
+                        CHAMELEON_Complex64_t alpha, CHAM_desc_t *A, CHAM_desc_t *B,
+                        CHAMELEON_Complex64_t beta,  CHAM_desc_t *C,
+                        RUNTIME_option_t *options )
+{
+    const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
+    RUNTIME_sequence_t *sequence = options->sequence;
+    int                 m, n, k;
+    int                 tempmm, tempnn, tempkn, tempkm;
+    int                 myrank = RUNTIME_comm_rank( chamctxt );
+    int                 reduceC[ C->mt * C->nt ];
+
+    /* Set C tiles to redux mode. */
+    for (n = 0; n < C->nt; n++) {
+        for (m = 0; m < C->mt; m++) {
+            reduceC[ n * C->mt + m ] = 0;
+
+            /* The node owns the C tile. */
+            if ( C->get_rankof( C(m, n) ) == myrank ) {
+                reduceC[ n * C->mt + m ] = 1;
+                RUNTIME_zgersum_set_methods( C(m, n) );
+                continue;
+            }
+
+            /*
+             * The node owns the A tile that will define the locality of the
+             * computations.
+             */
+            if ( transA == ChamNoTrans ) {
+                for (k = 0; k < A->nt; k++) {
+                    if ( A->get_rankof( A(m, k) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+            else {
+                for (k = 0; k < A->mt; k++) {
+                    if ( A->get_rankof( A(k, m) ) == myrank ) {
+                        reduceC[ n * C->mt + m ] = 1;
+                        RUNTIME_zgersum_set_methods( C(m, n) );
+                        break;
+                    }
+                }
+            }
+        }
+    }
+
+    for (n = 0; n < C->nt; n++) {
+        tempnn = n == C->nt-1 ? C->n-n*C->nb : C->nb;
+        for (m = 0; m < C->mt; m++) {
+            tempmm = m == C->mt-1 ? C->m-m*C->mb : C->mb;
+
+            /* Scale C */
+            options->forcesub = 0;
+            INSERT_TASK_zlascal( options, ChamUpperLower, tempmm, tempnn, C->mb,
+                                 beta, C, m, n );
+            options->forcesub = reduceC[ n * C->mt + m ];
+
+            /*
+             *  A: ChamNoTrans / B: ChamNoTrans
+             */
+            if (transA == ChamNoTrans) {
+                if (transB == ChamNoTrans) {
+                    for (k = 0; k < A->nt; k++) {
+                        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+
+                        INSERT_TASK_zgemm_Astat(
+                            options,
+                            transA, transB,
+                            tempmm, tempnn, tempkn, A->mb,
+                            alpha, A(m, k),  /* lda * Z */
+                                   B(k, n),  /* ldb * Y */
+                            zone,  C(m, n)); /* ldc * Y */
+                    }
+                }
+                /*
+                 *  A: ChamNoTrans / B: Cham[Conj]Trans
+                 */
+                else {
+                    for (k = 0; k < A->nt; k++) {
+                        tempkn = k == A->nt-1 ? A->n-k*A->nb : A->nb;
+
+                        INSERT_TASK_zgemm_Astat(
+                            options,
+                            transA, transB,
+                            tempmm, tempnn, tempkn, A->mb,
+                            alpha, A(m, k),  /* lda * Z */
+                                   B(n, k),  /* ldb * Z */
+                            zone,  C(m, n)); /* ldc * Y */
+                    }
+                }
+            }
+            /*
+             *  A: Cham[Conj]Trans / B: ChamNoTrans
+             */
+            else {
+                if (transB == ChamNoTrans) {
+                    for (k = 0; k < A->mt; k++) {
+                        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+
+                        INSERT_TASK_zgemm_Astat(
+                            options,
+                            transA, transB,
+                            tempmm, tempnn, tempkm, A->mb,
+                            alpha, A(k, m),  /* lda * X */
+                                   B(k, n),  /* ldb * Y */
+                            zone,  C(m, n)); /* ldc * Y */
+                    }
+                }
+                /*
+                 *  A: Cham[Conj]Trans / B: Cham[Conj]Trans
+                 */
+                else {
+                    for (k = 0; k < A->mt; k++) {
+                        tempkm = k == A->mt-1 ? A->m-k*A->mb : A->mb;
+
+                        INSERT_TASK_zgemm_Astat(
+                            options,
+                            transA, transB,
+                            tempmm, tempnn, tempkm, A->mb,
+                            alpha, A(k, m),  /* lda * X */
+                                   B(n, k),  /* ldb * Z */
+                            zone,  C(m, n)); /* ldc * Y */
+                    }
+                }
+            }
+            RUNTIME_zgersum_submit_tree( options, C(m, n) );
+            RUNTIME_data_flush( sequence, C(m, n) );
+        }
+    }
+    options->forcesub = 0;
+
+    (void)chamctxt;
+}
+
 /**
  *  Parallel tile matrix-matrix multiplication
  *  SUMMA algorithm for 2D block-cyclic data distribution.
@@ -284,6 +426,7 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws,
 {
     CHAM_context_t *chamctxt;
     RUNTIME_option_t options;
+    cham_gemm_t alg = (ws != NULL) ? ws->alg : ChamGemmAlgGeneric;
 
     chamctxt = chameleon_context_self();
     if (sequence->status != CHAMELEON_SUCCESS) {
@@ -291,13 +434,21 @@ chameleon_pzgemm( struct chameleon_pzgemm_s *ws,
     }
     RUNTIME_options_init( &options, chamctxt, sequence, request );
 
-    if ( ws && ws->summa )
-    {
+    switch( alg ) {
+    case ChamGemmAlgAuto:
+    case ChamGemmAlgSummaB: /* Switch back to generic since it does not exist yet. */
+    case ChamGemmAlgGeneric:
+        chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
+        break;
+
+    case ChamGemmAlgSummaC:
         chameleon_pzgemm_summa( chamctxt, transA, transB, alpha, A, B, beta, C,
                                 &(ws->WA), &(ws->WB), &options );
-    }
-    else {
-        chameleon_pzgemm_generic( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
+        break;
+
+    case ChamGemmAlgSummaA:
+        chameleon_pzgemm_Astat( chamctxt, transA, transB, alpha, A, B, beta, C, &options );
+        break;
     }
 
     RUNTIME_options_finalize( &options, chamctxt );
diff --git a/compute/pzgepdf_qdwh.c b/compute/pzgepdf_qdwh.c
index 48617938bb3c59e4c75716e5986d888cf3aedd1d..b50edf517487e469b41f59d3113408d631cea0f6 100644
--- a/compute/pzgepdf_qdwh.c
+++ b/compute/pzgepdf_qdwh.c
@@ -175,7 +175,7 @@ chameleon_pzgepdf_qdwh_init( const CHAM_desc_t *U, const CHAM_desc_t *H,
     /*
      * Allocate the data descriptors for the lookahead if needed
      */
-    *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, NULL, NULL, U );
+    *gemm_ws = CHAMELEON_zgemm_WS_Alloc( ChamNoTrans, ChamNoTrans, U, U, U );
 
     return;
 }
diff --git a/compute/zgemm.c b/compute/zgemm.c
index e3eed82ec66a299b83c74bbdc274ad867950bc2f..325f27cd90dc3cbf8dee13f26020f613e98a501a 100644
--- a/compute/zgemm.c
+++ b/compute/zgemm.c
@@ -103,14 +103,77 @@ void *CHAMELEON_zgemm_WS_Alloc( cham_trans_t       transA __attribute__((unused)
     }
 
     options = calloc( 1, sizeof(struct chameleon_pzgemm_s) );
-    options->summa = 0;
+    options->alg = ChamGemmAlgAuto;
+
+    /*
+     * If only one process, or if generic has been globally enforced, we switch
+     * to generic immediately.
+     */
+    if ( ((C->p == 1) && (C->q == 1)) ||
+         (chamctxt->generic_enabled == CHAMELEON_TRUE) )
+    {
+        options->alg = ChamGemmAlgGeneric;
+    }
 
-    if ( ((C->p > 1) || (C->q > 1)) &&
-         (C->get_rankof == chameleon_getrankof_2d) &&
-         (chamctxt->generic_enabled != CHAMELEON_TRUE) )
+    /* Look at environment variable is something enforces the variant. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        char *algostr = chameleon_getenv( "CHAMELEON_GEMM_ALGO" );
+
+        if ( algostr ) {
+            if ( strcasecmp( algostr, "summa_c" ) == 0 ) {
+                options->alg = ChamGemmAlgSummaC;
+            }
+            else if ( strcasecmp( algostr, "summa_a" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else if ( strcasecmp( algostr, "summa_b" ) == 0  ) {
+                options->alg = ChamGemmAlgSummaB;
+            }
+            else if ( strcasecmp( algostr, "generic" ) == 0  ) {
+                options->alg = ChamGemmAlgGeneric;
+            }
+            else if ( strcasecmp( algostr, "auto" ) == 0  ) {
+                options->alg = ChamGemmAlgAuto;
+            }
+            else {
+                fprintf( stderr, "ERROR: CHAMELEON_GEMM_ALGO is not one of AUTO, SUMMA_A, SUMMA_B, SUMMA_C, GENERIC => Switch back to Automatic switch\n" );
+            }
+        }
+        chameleon_cleanenv( algostr );
+    }
+
+    /* Perform automatic choice if not already enforced. */
+    if ( options->alg == ChamGemmAlgAuto )
+    {
+        double sizeA, sizeB, sizeC;
+        double ratio = 1.5; /* Arbitrary ratio to give more weight to writes wrt reads. */
+
+        /* Compute the average array per node for each matrix */
+        sizeA = ((double)A->m * (double)A->n) / (double)(A->p * A->q);
+        sizeB = ((double)B->m * (double)B->n) / (double)(B->p * B->q);
+        sizeC = ((double)C->m * (double)C->n) / (double)(C->p * C->q) * ratio;
+
+        if ( (sizeC > sizeA) && (sizeC > sizeB) ) {
+            options->alg = ChamGemmAlgSummaC;
+        }
+        else {
+            if ( sizeA > sizeB ) {
+                options->alg = ChamGemmAlgSummaA;
+            }
+            else {
+                options->alg = ChamGemmAlgSummaB;
+            }
+        }
+    }
+
+    assert( options->alg != ChamGemmAlgAuto );
+
+    /* Now that we have decided which algorithm, let's allocate the required data structures. */
+    if ( (options->alg == ChamGemmAlgSummaC ) &&
+         (C->get_rankof == chameleon_getrankof_2d ) )
     {
         int lookahead = chamctxt->lookahead;
-        options->summa = 1;
 
         chameleon_desc_init( &(options->WA), CHAMELEON_MAT_ALLOC_TILE,
                              ChamComplexDouble, C->mb, C->nb, (C->mb * C->nb),
@@ -150,7 +213,7 @@ void CHAMELEON_zgemm_WS_Free( void *user_ws )
 {
     struct chameleon_pzgemm_s *ws = (struct chameleon_pzgemm_s*)user_ws;
 
-    if ( ws->summa ) {
+    if ( ws->alg == ChamGemmAlgSummaC ) {
         chameleon_desc_destroy( &(ws->WA) );
         chameleon_desc_destroy( &(ws->WB) );
     }
diff --git a/control/compute_z.h b/control/compute_z.h
index d7953e9045a3e967781e8985f2ed851f2a529eae..760127d31d5eab6d4339316c2888f36e6a535666 100644
--- a/control/compute_z.h
+++ b/control/compute_z.h
@@ -30,7 +30,7 @@
  * @brief Data structure to handle the GEMM workspaces
  */
 struct chameleon_pzgemm_s {
-    int summa;
+    cham_gemm_t alg;
     CHAM_desc_t WA;
     CHAM_desc_t WB;
 };
diff --git a/control/descriptor.h b/control/descriptor.h
index 42669511a0afcdc6b9e4b98b7a43042e05d5d84d..47f0edbdfa270ad988609cc1baffbd542f5c4a93 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -233,9 +233,9 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n )
  * CHAMELEON_ACCESS_RW(C, Cm, Cn)
  * CHAMELEON_END_ACCESS_DECLARATION
  */
-#define CHAMELEON_BEGIN_ACCESS_DECLARATION {    \
-    unsigned __chameleon_need_exec = 0;         \
-    unsigned __chameleon_need_submit = 0;       \
+#define CHAMELEON_BEGIN_ACCESS_DECLARATION {                    \
+    unsigned __chameleon_need_exec = 0;                         \
+    unsigned __chameleon_need_submit = options->forcesub;       \
     RUNTIME_BEGIN_ACCESS_DECLARATION
 
 #define CHAMELEON_ACCESS_R(A, Am, An) do {                              \
diff --git a/doc/user/chapters/using.org b/doc/user/chapters/using.org
index 721b99ad524b3b9443cc3b323c9ce237776b1f89..d104237740f931bdf2695fd9e78f68f5f2f9f0a3 100644
--- a/doc/user/chapters/using.org
+++ b/doc/user/chapters/using.org
@@ -914,6 +914,17 @@
      int CHAMELEON_Get     (CHAMELEON_enum param, int *value);
      #+end_src
 
+     * Alternatively, Chameleon can also be configured through environment variables.
+       * *CHAMELEON_GEMM_ALGO* give the possibility to switch among
+         multiple variants of the GEMM algorithms. These variants are
+         *GENERIC* for the generic variant that should work with any
+         configuration; *SUMMA_C* that works for 2D block cyclic
+         distribution of the matrices A, B, and C with a C stationnary
+         version; *SUMMA_A* and *SUMMA_B* are SUMMA variant of the
+         algorithm that works for any distribution with respectively
+         *A*, or *B that are stationnary. Note that the last two
+         variants are only available with the StarPU runtime backend.
+
 **** Auxiliary routines
 
      Reports CHAMELEON version number.
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index d311bf63703e3de4ee27d2bfa15c9127a80db281..de26616f1df0e5f0c07db2e1d03550db1bd78e80 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -181,6 +181,16 @@ typedef enum chameleon_store_e {
     ChamEltwise    = 403, /**< Element by element storage */
 } cham_store_t;
 
+/**
+ * @brief Chameleon GEMM-like algorithms
+ */
+typedef enum chameleon_gemm_e {
+    ChamGemmAlgAuto = -1,
+    ChamGemmAlgGeneric,
+    ChamGemmAlgSummaA,
+    ChamGemmAlgSummaB,
+    ChamGemmAlgSummaC
+} cham_gemm_t;
 
 #define ChameleonTrd            1001
 #define ChameleonBrd            1002
diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h
index d1b0d9b4917c70d8b5fbdab4ac37c23dbc078598..30ec9a1a014a8d8f71debe4dc8065d10ee0ee5b5 100644
--- a/include/chameleon/runtime_struct.h
+++ b/include/chameleon/runtime_struct.h
@@ -84,6 +84,7 @@ typedef struct runtime_option_s {
     int                 parallel;  /**< Enable/Disable the parallel version of submitted tasks   */
     int                 priority;  /**< Define the submitted task priority                       */
     int                 workerid;  /**< Define the prefered worker id to perform the tasks       */
+    int                 forcesub;  /**< Force task submission if true                            */
     size_t              ws_wsize;  /**< Define the worker workspace size                         */
     size_t              ws_hsize;  /**< Define the host workspace size for hybrid CPU/GPU kernel */
     void               *ws_worker; /**< Pointer to the worker workspace (structure)              */
diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index 9a26496f199568818f63751ef98a3c58d46b725f..ae307f760eb78d5ec4b3fec9a41db694211382be 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -65,6 +65,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn,
                         CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
+void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                              cham_trans_t transA, cham_trans_t transB,
+                              int m, int n, int k, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                              const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn );
 void INSERT_TASK_zgeqrt( const RUNTIME_option_t *options,
                          int m, int n, int ib, int nb,
                          const CHAM_desc_t *A, int Am, int An,
@@ -452,4 +458,8 @@ void INSERT_TASK_zgram( const RUNTIME_option_t *options,
                         const CHAM_desc_t *D, int Dm, int Dn,
                         CHAM_desc_t *A, int Am, int An);
 
+void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An );
+void RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
+                                  const CHAM_desc_t *A, int Am, int An );
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index bda8ef2732cdf6a7cb8b9a12128bb9175e9f584e..dec860e8e112b70e2e40798ebc4feda3e16ab46c 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -107,6 +107,10 @@ set(CODELETS_ZSRC
     codelets/codelet_zgesum.c
     codelets/codelet_zcesca.c
     codelets/codelet_zgram.c
+    ##################
+    # Reduction methods
+    ##################
+    codelets/codelet_zgersum.c
     )
 
 set(CODELETS_SRC
diff --git a/runtime/openmp/codelets/codelet_zgemm.c b/runtime/openmp/codelets/codelet_zgemm.c
index b94aff972b1a2168b243445e1a2c527ee3603a4e..43b75d728a73cd2e8e905fa23e35230894acbe24 100644
--- a/runtime/openmp/codelets/codelet_zgemm.c
+++ b/runtime/openmp/codelets/codelet_zgemm.c
@@ -42,3 +42,16 @@ INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     (void)options;
     (void)nb;
 }
+
+void
+INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                         cham_trans_t transA, cham_trans_t transB,
+                         int m, int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/openmp/codelets/codelet_zgersum.c b/runtime/openmp/codelets/codelet_zgersum.c
new file mode 100644
index 0000000000000000000000000000000000000000..b320707c993a1521ef76d1219ae60c86db96113e
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zgersum.c
@@ -0,0 +1,41 @@
+/**
+ *
+ * @file starpu/codelet_zgersum.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgersum OpenMP codelet
+ *
+ * @version 1.2.0
+ * @author Romain Peressoni
+ * @author Mathieu Faverge
+ * @date 2022-02-22
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+
+void
+RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
+{
+    fprintf( stderr, "WARNING: Reductions are not available with OpenMP\n" );
+
+    (void)A;
+    (void)Am;
+    (void)An;
+}
+
+void
+RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
+                             const CHAM_desc_t *A, int Am, int An )
+{
+    (void)options;
+    (void)A;
+    (void)Am;
+    (void)An;
+}
diff --git a/runtime/openmp/control/runtime_options.c b/runtime/openmp/control/runtime_options.c
index e626e9fa5d419dd3f79630575e1d4b45fbcf70cb..541e7cb2fcff50d45999d728cdf4c423b7b82c9f 100644
--- a/runtime/openmp/control/runtime_options.c
+++ b/runtime/openmp/control/runtime_options.c
@@ -30,6 +30,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->parallel  = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
     options->priority  = RUNTIME_PRIORITY_MIN;
     options->workerid  = -1;
+    options->forcesub  = 0;
     options->ws_wsize  = 0;
     options->ws_hsize  = 0;
     options->ws_worker = NULL;
diff --git a/runtime/parsec/codelets/codelet_zgemm.c b/runtime/parsec/codelets/codelet_zgemm.c
index 303aa03ba02378093a7cedc5d02c5ea00eee9efe..5a0938ed97e3dc116c18007b55897ddad0253c75 100644
--- a/runtime/parsec/codelets/codelet_zgemm.c
+++ b/runtime/parsec/codelets/codelet_zgemm.c
@@ -89,3 +89,16 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+void
+INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                         cham_trans_t transA, cham_trans_t transB,
+                         int m, int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/parsec/codelets/codelet_zgersum.c b/runtime/parsec/codelets/codelet_zgersum.c
new file mode 100644
index 0000000000000000000000000000000000000000..fb460724d0ab51b55f1ea5833af0dd7dba5d9912
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zgersum.c
@@ -0,0 +1,41 @@
+/**
+ *
+ * @file starpu/codelet_zgersum.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgersum Parsec codelet
+ *
+ * @version 1.2.0
+ * @author Romain Peressoni
+ * @author Mathieu Faverge
+ * @date 2022-02-22
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+
+void
+RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
+{
+    fprintf( stderr, "WARNING: Reductions are not available with Parsec\n" );
+
+    (void)A;
+    (void)Am;
+    (void)An;
+}
+
+void
+RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
+                             const CHAM_desc_t *A, int Am, int An )
+{
+    (void)options;
+    (void)A;
+    (void)Am;
+    (void)An;
+}
diff --git a/runtime/parsec/control/runtime_options.c b/runtime/parsec/control/runtime_options.c
index 0fe1498b236ccfad82d75898992094b0cb1c785a..1a4e9e83a2ffef663e6eff6e61200bfb5af5110c 100644
--- a/runtime/parsec/control/runtime_options.c
+++ b/runtime/parsec/control/runtime_options.c
@@ -27,7 +27,8 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->profiling  = CHAMELEON_STATISTICS == CHAMELEON_TRUE;
     options->parallel   = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
     options->priority   = RUNTIME_PRIORITY_MIN;
-    options->workerid  = -1;
+    options->workerid   = -1;
+    options->forcesub   = 0;
     options->ws_wsize   = 0;
     options->ws_hsize   = 0;
     options->ws_worker  = NULL;
diff --git a/runtime/quark/codelets/codelet_zgemm.c b/runtime/quark/codelets/codelet_zgemm.c
index 487fe27cc6a57fe548ed2a0b81cdddc26b3a517d..d854c638a2f01e68018ca0973e67ce8a4d0604a6 100644
--- a/runtime/quark/codelets/codelet_zgemm.c
+++ b/runtime/quark/codelets/codelet_zgemm.c
@@ -75,3 +75,16 @@ void INSERT_TASK_zgemm(const RUNTIME_option_t *options,
                       sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),                 accessC,
                       0);
 }
+
+void
+INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                         cham_trans_t transA, cham_trans_t transB,
+                         int m, int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    INSERT_TASK_zgemm( options, transA, transB, m, n, k, nb,
+                       alpha, A, Am, An, B, Bm, Bn,
+                       beta, C, Cm, Cn );
+}
diff --git a/runtime/quark/codelets/codelet_zgersum.c b/runtime/quark/codelets/codelet_zgersum.c
new file mode 100644
index 0000000000000000000000000000000000000000..e6accf1cf76c9b6af4e1ec918f09d8ac62e0dca5
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zgersum.c
@@ -0,0 +1,41 @@
+/**
+ *
+ * @file starpu/codelet_zgersum.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgersum Quark codelet
+ *
+ * @version 1.2.0
+ * @author Romain Peressoni
+ * @author Mathieu Faverge
+ * @date 2022-02-22
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+
+void
+RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
+{
+    fprintf( stderr, "WARNING: Reductions are not available with Quark\n" );
+
+    (void)A;
+    (void)Am;
+    (void)An;
+}
+
+void
+RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
+                             const CHAM_desc_t *A, int Am, int An )
+{
+    (void)options;
+    (void)A;
+    (void)Am;
+    (void)An;
+}
diff --git a/runtime/quark/control/runtime_options.c b/runtime/quark/control/runtime_options.c
index 4723d28e2810327016c754117ca12a9609a912e9..e0f58ee5d62fb2cbde7f8d4fb9e31f964726107b 100644
--- a/runtime/quark/control/runtime_options.c
+++ b/runtime/quark/control/runtime_options.c
@@ -40,6 +40,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->parallel   = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
     options->priority   = RUNTIME_PRIORITY_MIN;
     options->workerid   = -1;
+    options->forcesub   = 0;
 
     options->ws_wsize   = 0;
     options->ws_hsize   = 0;
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 3fb2514a9a98c7b08f37d700e87dde37170d73e2..cf460a8d1d0bb531b4b170975196ee5331589ac0 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -129,6 +129,18 @@ if ( STARPU_FOUND )
     if ( HAVE_STARPU_MPI_COMM_GET_ATTR )
       message("-- ${Blue}Add definition HAVE_STARPU_MPI_COMM_GET_ATTR${ColourReset}")
     endif()
+    check_c_source_compiles("
+#include <starpu.h>
+int main(void) {
+  enum starpu_data_access_mode access = STARPU_MPI_REDUX;
+  return 0;
+}
+"
+      HAVE_STARPU_MPI_REDUX
+      )
+    if ( HAVE_STARPU_MPI_REDUX )
+      message("-- ${Blue}Add definition HAVE_STARPU_MPI_REDUX${ColourReset}")
+    endif()
   endif()
 
   if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 912b44f1391cebe46b3434bff9df80cdb9c799fa..bf72c91c2cd697268438ac9641f977be4b77cd51 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -98,6 +98,96 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg )
  */
 CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
 
+void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                              cham_trans_t transA, cham_trans_t transB,
+                              int m, int n, int k, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                                    beta, C, Cm, Cn );
+    }
+
+    struct cl_zgemm_args_s  *clargs = NULL;
+    void (*callback)(void*);
+    int                      accessC;
+    int                      exec    = 0;
+    char                    *cl_name = "zgemm_Astat";
+
+    /* Handle cache */
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+     /* Check A as write, since it will be the owner of the computation */
+    CHAMELEON_ACCESS_W(A, Am, An);
+    CHAMELEON_ACCESS_R(B, Bm, Bn);
+     /* Check C as read, since it will be used in a reduction */
+    CHAMELEON_ACCESS_R(C, Cm, Cn);
+    exec = __chameleon_need_exec;
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
+        clargs->transA = transA;
+        clargs->transB = transB;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->k      = k;
+        clargs->alpha  = alpha;
+        clargs->tileA  = A->get_blktile( A, Am, An );
+        clargs->tileB  = B->get_blktile( B, Bm, Bn );
+        clargs->beta   = beta;
+        clargs->tileC  = C->get_blktile( C, Cm, Cn );
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zgemm_callback : NULL;
+
+    /* Reduce the C access if needed */
+    if ( beta == 0. ) {
+        accessC = STARPU_W;
+    }
+#if defined(HAVE_STARPU_MPI_REDUX)
+    else if ( beta == 1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+    else {
+        accessC = STARPU_RW;
+    }
+
+#if defined(CHAMELEON_KERNELS_TRACE)
+    {
+        char *cl_fullname;
+        chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name );
+        cl_name = cl_fullname;
+    }
+#endif
+
+    /* Insert the task */
+    rt_starpu_insert_task(
+        &cl_zgemm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_NODE,   A->get_rankof(A, Am, An),
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              cl_name,
+#endif
+
+        0 );
+
+    (void)nb;
+}
+
 void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                         cham_trans_t transA, cham_trans_t transB,
                         int m, int n, int k, int nb,
diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c
new file mode 100644
index 0000000000000000000000000000000000000000..26327c982f997473de0e17885a8381a258ecaeb6
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zgersum.c
@@ -0,0 +1,129 @@
+/**
+ *
+ * @file starpu/codelet_zgersum.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2022 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zgersum StarPU codelet
+ *
+ * @version 1.2.0
+ * @author Romain Peressoni
+ * @author Mathieu Faverge
+ * @date 2022-02-22
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg)
+{
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    assert( tileA->m == tileB->m );
+    assert( tileA->n == tileB->n );
+
+    TCORE_zgeadd( ChamNoTrans, tileA->m, tileA->n, 1., tileB, 1., tileA );
+
+    return;
+}
+
+#ifdef CHAMELEON_USE_CUBLAS
+static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg)
+{
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
+    CHAMELEON_Complex64_t zone = 1.;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    assert( tileA->m == tileB->m );
+    assert( tileA->n == tileB->n );
+
+    CUDA_zgeadd( ChamNoTrans, tileA->m, tileA->n,
+                 &zone, tileB->mat, tileB->ld,
+                 &zone, tileA->mat, tileA->ld,
+                 handle );
+
+    return;
+}
+#endif /* defined(CHAMELEON_USE_CUBLAS) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+#if defined(CHAMELEON_USE_CUBLAS)
+CODELETS(zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC)
+#else
+CODELETS_CPU(zgersum_redux, cl_zgersum_redux_cpu_func)
+#endif
+
+#if !defined(CHAMELEON_SIMULATION)
+static void
+cl_zgersum_init_cpu_func( void *descr[], void *cl_arg )
+{
+    CHAM_tile_t *tileA;
+
+    tileA = cti_interface_get(descr[0]);
+
+    TCORE_zlaset( ChamUpperLower, tileA->m, tileA->n, 0., 0., tileA );
+
+    (void)cl_arg;
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zgersum_init, cl_zgersum_init_cpu_func );
+
+void
+RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
+{
+#if defined(HAVE_STARPU_MPI_REDUX)
+    starpu_data_set_reduction_methods( RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+                                       &cl_zgersum_redux,
+                                       &cl_zgersum_init );
+#endif
+}
+
+void
+RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
+                             const CHAM_desc_t *A, int Am, int An )
+{
+#if defined(HAVE_STARPU_MPI_REDUX) && defined(CHAMELEON_USE_MPI)
+    starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD,
+                                     RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+                                     options->priority + 1,
+                                     2 /* Arbre binaire */ );
+#else
+    (void)options;
+    (void)A;
+    (void)Am;
+    (void)An;
+#endif
+}
+
+void RUNTIME_zgersum_init( void ) __attribute__( ( constructor ) );
+void RUNTIME_zgersum_init( void )
+{
+    cl_zgersum_init.nbuffers = 1;
+    cl_zgersum_init.modes[0] = STARPU_W;
+
+    cl_zgersum_redux.nbuffers = 2;
+    cl_zgersum_redux.modes[0] = STARPU_RW | STARPU_COMMUTE;
+    cl_zgersum_redux.modes[1] = STARPU_R;
+}
diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c
index 0fa917acc326e69c03609d750ae15cd0675bf9ec..373a5c3bd395390d63bf5b2ffb7b08aea8d35a17 100644
--- a/runtime/starpu/control/runtime_options.c
+++ b/runtime/starpu/control/runtime_options.c
@@ -31,6 +31,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt,
     options->parallel  = CHAMELEON_PARALLEL == CHAMELEON_TRUE;
     options->priority  = RUNTIME_PRIORITY_MIN;
     options->workerid  = (schedopt == NULL) ? -1 : schedopt->workerid;
+    options->forcesub  = 0;
     options->ws_wsize  = 0;
     options->ws_hsize  = 0;
     options->ws_worker = NULL;
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index c0c71d25965e13c2f98bbf8d50624b5488058af9..9b7d355b9e66dbc1dde3458ea351412177e00f67 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -45,6 +45,7 @@
 #cmakedefine HAVE_STARPU_MPI_WAIT_FOR_ALL
 #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER
 #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER
+#cmakedefine HAVE_STARPU_MPI_REDUX
 
 #if (!defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER) && !defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER)) && defined(CHAMELEON_USE_MPI_DATATYPES)
 #error "This version of StarPU does not support MPI datatypes (Please compile with -DCHAMELEON_USE_MPI_DATATYPES=OFF)"