diff --git a/ChangeLog b/ChangeLog
index 1328ff7e129c969c0b5d994560fa539135a726ec..23cd6845ae17427484f1122274166e723f43554b 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,5 +1,8 @@
 chameleon-1.4.0
 ------------------------------------------------------------------------
+ - StarPU: When using starpu > 1.4.8, use the new distributed submit interface in the codelets instead of the classical insert task interface.
+ - ci: use -Werror to prevent from adding warning to the code
+ - Fix: all warning reported by the switch to -Werror
 
 chameleon-1.3.0
 ------------------------------------------------------------------------
diff --git a/control/async.c b/control/async.c
index d933e381789144622826025f79a1737a3a57d3dc..d877bca7d9d5a0f883eea642dd39565690fad15c 100644
--- a/control/async.c
+++ b/control/async.c
@@ -48,6 +48,9 @@ int chameleon_sequence_create(CHAM_context_t *chamctxt, RUNTIME_sequence_t **seq
         return CHAMELEON_ERR_OUT_OF_RESOURCES;
     }
 
+    (*sequence)->comm   = chamctxt->comm;
+    (*sequence)->myrank = RUNTIME_comm_rank( chamctxt );
+
     RUNTIME_sequence_create( chamctxt, *sequence );
 
     (*sequence)->status = CHAMELEON_SUCCESS;
diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h
index cbedc48a172b5993e08003c5e9ba02cac74addfe..d2122d0838f375986641b8cb922f431a48eabbdf 100644
--- a/include/chameleon/runtime_struct.h
+++ b/include/chameleon/runtime_struct.h
@@ -84,9 +84,10 @@ typedef struct runtime_request_s {
  */
 typedef struct runtime_sequence_s {
     int                status;   /**< Return status registered by the tasks for the request     */
+    int                myrank;   /**< MPI Comm rank within the associated communicator          */
     RUNTIME_request_t *request;  /**< Pointer to the request that failed if any, NULL otherwise */
     void              *schedopt; /**< Specific runtime data pointer to handle the sequence      */
-    MPI_Comm           comm;     /**< MPI communicator                                         */
+    MPI_Comm           comm;     /**< MPI communicator                                          */
 } RUNTIME_sequence_t;
 
 /**
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 801a1f89a5c037f9cdd6ea48400d8d81a7538a01..b7313a36f2a9c5ac217ca3d45f6d5b4fa78f7145 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -180,6 +180,19 @@ int main() {
     if ( HAVE_STARPU_MPI_DATA_CPY_PRIORITY )
       message("-- ${Blue}Add definition HAVE_STARPU_MPI_DATA_CPY_PRIORITY${ColourReset}")
     endif()
+
+    check_function_exists(starpu_mpi_exchange_data_before_execution HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION)
+    if ( HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION )
+      message("-- ${Blue}Add definition HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION${ColourReset}")
+    endif()
+
+  endif()
+
+  if ( CHAMELEON_USE_MPI AND NOT HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION )
+    set( CHAMELEON_STARPU_USE_INSERT ON CACHE BOOL "Enable the task insert interface instead of the task submit" FORCE)
+  else()
+    option( CHAMELEON_STARPU_USE_INSERT
+      "Enable the task insert interface instead of the task submit" OFF )
   endif()
 
   if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index b276d515708d9395652e7f1ecac567a0d56bceec..dfdb27fc4666b2fa84c63eb7621224b9a5cad256 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -25,23 +25,27 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zgeadd_args_s {
+    cham_trans_t          trans;
+    int                   m;
+    int                   n;
+    CHAMELEON_Complex64_t alpha;
+    CHAMELEON_Complex64_t beta;
+};
+
 #if !defined(CHAMELEON_SIMULATION)
 static void
 cl_zgeadd_cpu_func( void *descr[], void *cl_arg )
 {
-    cham_trans_t          trans;
-    int                   M;
-    int                   N;
-    CHAMELEON_Complex64_t alpha;
-    CHAM_tile_t          *tileA;
-    CHAMELEON_Complex64_t beta;
-    CHAM_tile_t          *tileB;
+    struct cl_zgeadd_args_s *clargs = (struct cl_zgeadd_args_s *)cl_arg;
+    CHAM_tile_t *tileA;
+    CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta );
-    TCORE_zgeadd( trans, M, N, alpha, tileA, beta, tileB );
+    TCORE_zgeadd( clargs->trans, clargs->m, clargs->n,
+                  clargs->alpha, tileA, clargs->beta, tileB );
 
     return;
 }
@@ -50,22 +54,17 @@ cl_zgeadd_cpu_func( void *descr[], void *cl_arg )
 static void
 cl_zgeadd_cuda_func( void *descr[], void *cl_arg )
 {
-    cublasHandle_t  handle = starpu_cublas_get_local_handle();
-    cham_trans_t    trans;
-    int             M;
-    int             N;
-    cuDoubleComplex alpha;
-    CHAM_tile_t    *tileA;
-    cuDoubleComplex beta;
-    CHAM_tile_t    *tileB;
+    struct cl_zgeadd_args_s *clargs = (struct cl_zgeadd_args_s *)cl_arg;
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t   *tileA;
+    CHAM_tile_t   *tileB;
 
     tileA = cti_interface_get( descr[0] );
     tileB = cti_interface_get( descr[1] );
-    starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta );
 
-    CUDA_zgeadd( trans, M, N,
-                 &alpha, tileA->mat, tileA->ld,
-                 &beta,  tileB->mat, tileB->ld,
+    CUDA_zgeadd( clargs->trans, clargs->m, clargs->n,
+                 (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
+                 (cuDoubleComplex*)&(clargs->beta),  tileB->mat, tileB->ld,
                  handle );
 
     return;
@@ -78,6 +77,7 @@ cl_zgeadd_cuda_func( void *descr[], void *cl_arg )
  */
 CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
                          cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
@@ -89,28 +89,124 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
         return;
     }
 
-    struct starpu_codelet *codelet = &cl_zgeadd;
-    void (*callback)(void*) = options->profiling ? cl_zgeadd_callback : NULL;
-    int accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    void (*callback)(void*);
+    struct cl_zgeadd_args_s *clargs  = NULL;
+    int                      exec    = 0;
+    const char              *cl_name = "zgeadd";
+    int                      accessB;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(B, Bm, Bn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zgeadd_args_s ) );
+        clargs->trans = trans;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+    }
+
+    /* Callback fro profiling information */
+    callback = options->profiling ? cl_zgeadd_callback : NULL;
+
+    /* Reduce the B access if needed */
+    accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &trans,              sizeof(int),
-        STARPU_VALUE,    &m,                  sizeof(int),
-        STARPU_VALUE,    &n,                  sizeof(int),
-        STARPU_VALUE,    &alpha,              sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,         RTBLKADDR(A, ChamComplexDouble, Am, An),
-        STARPU_VALUE,    &beta,               sizeof(CHAMELEON_Complex64_t),
-        accessB,          RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
+        &cl_zgeadd,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgeadd_args_s),
+        STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
+        accessB,       RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_NAME,              cl_name,
         0 );
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
+                         cham_trans_t trans, int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                             beta, B, Bm, Bn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zgeadd, 2 );
+    int accessB;
+
+    /* Reduce the B access if needed */
+    accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, accessB  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zgeadd_args_s ) );
+        clargs->trans = trans;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zgeadd_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgeadd_callback );
+
+        /* Flops */
+        //task->flops = flops_zgeadd( m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zgeadd", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index d4895c5daaa5999eba42e89e2cd2b6db0fbb59a9..1598f7ebcfbf83ed73cce710b7fe6150ffac1c85 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -126,6 +126,7 @@ CODELETS_GPU( zgemm, cl_zgemm_cpu_func, cl_zgemm_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                               cham_trans_t transA, cham_trans_t transB,
                               int m, int n, int k, int nb,
@@ -139,12 +140,12 @@ void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zgemm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
+    struct cl_zgemm_args_s  *clargs  = NULL;
     int                      exec    = 0;
     const char              *cl_name = "zgemm_Astat";
     uint32_t                 where   = cl_zgemm.where;
+    int                      accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -229,12 +230,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zgemm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
-    int                      exec = 0;
+    struct cl_zgemm_args_s  *clargs  = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "zgemm";
     uint32_t                 where   = cl_zgemm.where;
+    int                      accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -295,3 +296,133 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_EXECUTE_WHERE,     where,
         0 );
 }
+
+#else
+
+void __INSERT_TASK_zgemm( const RUNTIME_option_t *options,
+                          int xrank, int accessC,
+                          cham_trans_t transA, cham_trans_t transB,
+                          int m, int n, int k, int nb,
+                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                       const CHAM_desc_t *B, int Bm, int Bn,
+                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zgemm, 3 );
+
+    /*
+     * Register the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, xrank );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* WARNING: CUDA 12.3 has an issue when k=1 in complex, thus we disable gemm on gpu in these cases */
+#if defined(PRECISION_z) || defined(PRECISION_c)
+        if ( k == 1 ) {
+            task->where = STARPU_CPU;
+        }
+#endif
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
+        clargs->transA = transA;
+        clargs->transB = transB;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->k      = k;
+        clargs->alpha  = alpha;
+        clargs->beta   = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zgemm_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgemm_callback );
+
+        /* Flops */
+        task->flops = flops_zgemm( m, n, k );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 3,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
+                              cham_trans_t transA, cham_trans_t transB,
+                              int m, int n, int k, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;
+
+#if defined(HAVE_STARPU_MPI_REDUX)
+    if ( beta == (CHAMELEON_Complex64_t)1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+
+    __INSERT_TASK_zgemm( options,
+                         A->get_rankof( A, Am, An ), accessC,
+                         transA, transB, m, n, k, nb,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+
+void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
+                        cham_trans_t transA, cham_trans_t transB,
+                        int m, int n, int k, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W :
+        (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0));
+
+    __INSERT_TASK_zgemm( options,
+                         C->get_rankof( C, Cm, Cn ), accessC,
+                         transA, transB, m, n, k, nb,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+#endif
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index 5210970da0610040b9285b964764c24faa95bc37..ce592b7c818785a3615ef8c0a95a38e2856f7ec9 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -28,10 +28,10 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zhemm_args_s {
-    cham_side_t side;
-    cham_uplo_t uplo;
-    int m;
-    int n;
+    cham_side_t           side;
+    cham_uplo_t           uplo;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
     CHAMELEON_Complex64_t beta;
 };
@@ -125,6 +125,7 @@ CODELETS_GPU( zhemm, cl_zhemm_cpu_func, cl_zhemm_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
                               int m, int n, int nb,
@@ -138,11 +139,11 @@ void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zhemm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
+    struct cl_zhemm_args_s  *clargs  = NULL;
     int                      exec    = 0;
     const char              *cl_name = "zhemm_Astat";
+    int                      accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -272,3 +273,122 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
         STARPU_NAME,              cl_name,
         0 );
 }
+
+#else
+
+void __INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                          cham_side_t side, cham_uplo_t uplo,
+                          int m, int n, int nb, int xrank, int accessC,
+                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                       const CHAM_desc_t *B, int Bm, int Bn,
+                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zhemm, 3 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, xrank );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zhemm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zhemm_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zhemm_callback );
+
+        /* Flops */
+        task->flops = flops_zhemm( side, m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 3,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;
+
+#if defined(HAVE_STARPU_MPI_REDUX)
+    if ( beta == (CHAMELEON_Complex64_t)1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+
+    __INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                         A->get_rankof( A, Am, An ), accessC,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+
+void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W :
+        (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0));
+
+    __INSERT_TASK_zhemm( options, side, uplo, m, n, nb,
+                         C->get_rankof( C, Cm, Cn ), accessC,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+#endif
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 587f8cb87e2f739e6a378a97d9c59f60c6fa1710..7176bbe393513f4d9db74f48b19c7b7d63dabbef 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -27,53 +27,50 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zher2k_args_s {
+    cham_uplo_t           uplo;
+    cham_trans_t          trans;
+    int                   n;
+    int                   k;
+    CHAMELEON_Complex64_t alpha;
+    double                beta;
+};
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zher2k_cpu_func(void *descr[], void *cl_arg)
 {
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    CHAMELEON_Complex64_t alpha;
+    struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    double beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-    TCORE_zher2k(uplo, trans,
-                n, k, alpha, tileA, tileB, beta, tileC);
+    TCORE_zher2k( clargs->uplo, clargs->trans,
+                  clargs->n, clargs->k, clargs->alpha,
+                  tileA, tileB, clargs->beta, tileC );
 }
 
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 {
     cublasHandle_t handle = starpu_cublas_get_local_handle();
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    cuDoubleComplex alpha;
+    struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    double beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-
-    CUDA_zher2k( uplo, trans,
-                 n, k,
-                 &alpha, tileA->mat, tileA->ld,
-                         tileB->mat, tileB->ld,
-                 &beta,  tileC->mat, tileC->ld,
+    CUDA_zher2k( clargs->uplo, clargs->trans,
+                 clargs->n, clargs->k,
+                 (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
+                                                     tileB->mat, tileB->ld,
+                 &(clargs->beta),                    tileC->mat, tileC->ld,
                  handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
@@ -82,28 +79,21 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg)
 static void cl_zher2k_hip_func(void *descr[], void *cl_arg)
 {
     hipblasHandle_t handle = starpu_hipblas_get_local_handle();
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    hipblasDoubleComplex alpha;
+    struct cl_zher2k_args_s *clargs = (struct cl_zher2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    double beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-
-    HIP_zher2k( uplo, trans,
-                 n, k,
-                 &alpha, tileA->mat, tileA->ld,
-                         tileB->mat, tileB->ld,
-                 &beta,  tileC->mat, tileC->ld,
-                 handle );
+    HIP_zher2k( clargs->uplo, clargs->trans,
+                clargs->n, clargs->k,
+                (hipblasDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
+                                                         tileB->mat, tileB->ld,
+                &(clargs->beta),                         tileC->mat, tileC->ld,
+                handle );
 }
 #endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -117,18 +107,13 @@ CODELETS_GPU( zher2k, cl_zher2k_cpu_func, cl_zher2k_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
-void
-INSERT_TASK_zher2k( const RUNTIME_option_t *options,
-                    cham_uplo_t uplo, cham_trans_t trans,
-                    int n, int k, int nb,
-                    CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                                                 const CHAM_desc_t *B, int Bm, int Bn,
-                    double beta,                 const CHAM_desc_t *C, int Cm, int Cn )
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         double beta,                 const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         INSERT_TASK_zlascal( options, uplo, n, n, nb,
@@ -136,30 +121,139 @@ INSERT_TASK_zher2k( const RUNTIME_option_t *options,
         return;
     }
 
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zher2k;
-    void (*callback)(void*) = options->profiling ? cl_zher2k_callback : NULL;
-    int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    void (*callback)(void*);
+    struct cl_zher2k_args_s *clargs  = NULL;
+    int                      exec    = 0;
+    const char              *cl_name = "zher2k";
+    int                      accessC;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zher2k_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+    }
+
+    /* Callback fro profiling information */
+    callback = options->profiling ? cl_zher2k_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
+    /* Refine name */
+    cl_name = chameleon_codelet_name( cl_name, 3,
+                                      A->get_blktile( A, Am, An ),
+                                      B->get_blktile( B, Bm, Bn ),
+                                      C->get_blktile( C, Cm, Cn ) );
+
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,      &uplo,                sizeof(int),
-        STARPU_VALUE,     &trans,                sizeof(int),
-        STARPU_VALUE,         &n,                        sizeof(int),
-        STARPU_VALUE,         &k,                        sizeof(int),
-        STARPU_VALUE,     &alpha,         sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
-        STARPU_R,                 RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
-        STARPU_VALUE,      &beta,                     sizeof(double),
-        accessC,                  RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
-        STARPU_PRIORITY,    options->priority,
-        STARPU_CALLBACK,    callback,
+        &cl_zher2k,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zher2k_args_s),
+        STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_R,      RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
+        accessC,       RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
+        STARPU_NAME,              cl_name,
         0 );
+
+    (void)nb;
+}
+
+#else
+
+void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         double beta,                 const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, uplo, n, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zher2k, 3 );
+    int accessC;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == (double)0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zher2k_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zher2k_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zher2k_callback );
+
+        /* Flops */
+        task->flops = flops_zher2k( k, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 3,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zher2k", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
 }
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index bcd9eb1ff2195d223dfbb10770ceb9e3e2b4872f..2a3eda14079e069ec08f7c090ce6bf9cab00cebb 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -29,12 +29,12 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zherk_args_s {
-    cham_uplo_t uplo;
+    cham_uplo_t  uplo;
     cham_trans_t trans;
-    int n;
-    int k;
-    double alpha;
-    double beta;
+    int          n;
+    int          k;
+    double       alpha;
+    double       beta;
 };
 
 #if !defined(CHAMELEON_SIMULATION)
@@ -109,6 +109,7 @@ CODELETS_GPU( zherk, cl_zherk_cpu_func, cl_zherk_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
@@ -121,11 +122,11 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zherk_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
-    int                      exec = 0;
-    const char              *cl_name = "zherk";
+    struct cl_zherk_args_s *clargs  = NULL;
+    int                     exec    = 0;
+    const char             *cl_name = "zherk";
+    int                     accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -173,3 +174,83 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zherk( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo, cham_trans_t trans,
+                        int n, int k, int nb,
+                        double alpha, const CHAM_desc_t *A, int Am, int An,
+                        double beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, uplo, n, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zherk, 2 );
+    int accessC;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == (double)0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zherk_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zherk_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zherk_callback );
+
+        /* Flops */
+        task->flops = flops_zherk( k, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zherk", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c
index 35fb10c358ec7e62610948cb79c306845641a373..c984f574d114672a038999219a54c7eb1cb32008 100644
--- a/runtime/starpu/codelets/codelet_zlascal.c
+++ b/runtime/starpu/codelets/codelet_zlascal.c
@@ -25,9 +25,9 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zlascal_args_s {
-    cham_uplo_t uplo;
-    int m;
-    int n;
+    cham_uplo_t           uplo;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
 };
 
@@ -49,6 +49,7 @@ cl_zlascal_cpu_func( void *descr[], void *cl_arg )
  */
 CODELETS_CPU( zlascal, cl_zlascal_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
                           cham_uplo_t uplo,
                           int m, int n, int nb,
@@ -64,10 +65,10 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zlascal_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      exec = 0;
-    const char              *cl_name = "zlascal";
+    struct cl_zlascal_args_s *clargs  = NULL;
+    int                       exec    = 0;
+    const char               *cl_name = "zlascal";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -102,3 +103,78 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
+                          cham_uplo_t uplo,
+                          int m, int n, int nb,
+                          CHAMELEON_Complex64_t alpha,
+                          const CHAM_desc_t *A, int Am, int An )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlaset( options, uplo, m, n,
+                            alpha, alpha, A, Am, An );
+        return;
+    }
+    else if ( alpha == 1. ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zlascal, 1 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_RW );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zlascal_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zlascal_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zlascal_callback );
+
+        /* Flops */
+        //task->flops = flops_zlascal( uplo, m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 1,
+                                             A->get_blktile( A, Am, An ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zlascal", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index ee38d7b92d7a67132ddd6a7eea4065a1a317c816..46cd4d7a616ec6113e05d19e60fb360e7c4f1752 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -29,11 +29,11 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zpotrf_args_s {
-    cham_uplo_t uplo;
-    int n;
-    int iinfo;
+    cham_uplo_t         uplo;
+    int                 n;
+    int                 iinfo;
     RUNTIME_sequence_t *sequence;
-    RUNTIME_request_t *request;
+    RUNTIME_request_t  *request;
 };
 
 #if !defined(CHAMELEON_SIMULATION)
@@ -65,14 +65,15 @@ CODELETS( zpotrf, cl_zpotrf_cpu_func, cl_zpotrf_cuda_func, STARPU_CUDA_ASYNC )
 CODELETS_CPU( zpotrf, cl_zpotrf_cpu_func )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n, int nb,
                          const CHAM_desc_t *A, int Am, int An,
                          int iinfo )
 {
-    struct cl_zpotrf_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      exec = 0;
+    struct cl_zpotrf_args_s *clargs  = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "zpotrf";
 
     /* Handle cache */
@@ -114,3 +115,69 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An,
+                         int iinfo )
+{
+    INSERT_TASK_COMMON_PARAMETERS( zpotrf, 1 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_RW );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zpotrf_args_s ) );
+        clargs->uplo     = uplo;
+        clargs->n        = n;
+        clargs->iinfo    = iinfo;
+        clargs->sequence = options->sequence;
+        clargs->request  = options->request;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zpotrf_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zpotrf_callback );
+
+        /* Flops */
+        task->flops = flops_zpotrf( n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 1,
+                                             A->get_blktile( A, Am, An ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 5882380fc36964275359a39695c29b8f54c4b344..c3ab90117f31d681fb10c20ec42ad41d63981fe9 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -29,10 +29,10 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zsymm_args_s {
-    cham_side_t side;
-    cham_uplo_t uplo;
-    int m;
-    int n;
+    cham_side_t           side;
+    cham_uplo_t           uplo;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
     CHAMELEON_Complex64_t beta;
 };
@@ -126,6 +126,7 @@ CODELETS_GPU( zsymm, cl_zsymm_cpu_func, cl_zsymm_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
                               int m, int n, int nb,
@@ -139,11 +140,11 @@ void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zsymm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
+    struct cl_zsymm_args_s  *clargs  = NULL;
     int                      exec    = 0;
     const char              *cl_name = "zsymm_Astat";
+    int                      accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -219,11 +220,11 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zsymm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
-    int                      exec = 0;
+    struct cl_zsymm_args_s  *clargs  = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "zsymm";
+    int                      accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -273,3 +274,122 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
         STARPU_NAME,              cl_name,
         0 );
 }
+
+#else
+
+void __INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                          cham_side_t side, cham_uplo_t uplo,
+                          int m, int n, int nb, int xrank, int accessC,
+                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                       const CHAM_desc_t *B, int Bm, int Bn,
+                          CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == (CHAMELEON_Complex64_t)0. ) {
+        INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zsymm, 3 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, xrank );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zsymm_args_s ) );
+        clargs->side  = side;
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zsymm_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsymm_callback );
+
+        /* Flops */
+        task->flops = flops_zsymm( side, m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 3,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zpotrf", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
+                              cham_side_t side, cham_uplo_t uplo,
+                              int m, int n, int nb,
+                              CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                           const CHAM_desc_t *B, int Bm, int Bn,
+                              CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;
+
+#if defined(HAVE_STARPU_MPI_REDUX)
+    if ( beta == (CHAMELEON_Complex64_t)1. ) {
+        accessC = STARPU_MPI_REDUX;
+    }
+#endif
+
+    __INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
+                         A->get_rankof( A, Am, An ), accessC,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+
+void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                     const CHAM_desc_t *B, int Bm, int Bn,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    /* Reduce the C access if needed */
+    int accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W :
+        (STARPU_RW | ((beta == (CHAMELEON_Complex64_t)1.) ? STARPU_COMMUTE : 0));
+
+    __INSERT_TASK_zsymm( options, side, uplo, m, n, nb,
+                         C->get_rankof( C, Cm, Cn ), accessC,
+                         alpha, A, Am, An,
+                                B, Bm, Bn,
+                         beta,  C, Cm, Cn );
+}
+#endif
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 179c2024e6bb13228e87830c46766bbae96002c5..f8675eceec9a1032a3bd5b61b717eb39e45d64db 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -27,53 +27,51 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zsyr2k_args_s {
+    cham_uplo_t           uplo;
+    cham_trans_t          trans;
+    int                   n;
+    int                   k;
+    CHAMELEON_Complex64_t alpha;
+    CHAMELEON_Complex64_t beta;
+};
+
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg)
 {
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    CHAMELEON_Complex64_t alpha;
+    struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    CHAMELEON_Complex64_t beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-    TCORE_zsyr2k(uplo, trans,
-                 n, k, alpha, tileA, tileB, beta, tileC);
+    TCORE_zsyr2k( clargs->uplo, clargs->trans,
+                  clargs->n, clargs->k, clargs->alpha,
+                  tileA, tileB, clargs->beta, tileC );
 }
 
 #if defined(CHAMELEON_USE_CUDA)
 static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 {
     cublasHandle_t handle = starpu_cublas_get_local_handle();
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    cuDoubleComplex alpha;
+    struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    cuDoubleComplex beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-
-    CUDA_zsyr2k( uplo, trans,
-                 n, k,
-                 &alpha, tileA->mat, tileA->ld,
-                         tileB->mat, tileB->ld,
-                 &beta,  tileC->mat, tileC->ld,
+    CUDA_zsyr2k( clargs->uplo, clargs->trans,
+                 clargs->n, clargs->k,
+                 (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
+                                                     tileB->mat, tileB->ld,
+                 (cuDoubleComplex*)&(clargs->beta),  tileC->mat, tileC->ld,
                  handle );
 }
 #endif /* defined(CHAMELEON_USE_CUDA) */
@@ -82,28 +80,21 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg)
 static void cl_zsyr2k_hip_func(void *descr[], void *cl_arg)
 {
     hipblasHandle_t handle = starpu_hipblas_get_local_handle();
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
-    hipblasDoubleComplex alpha;
+    struct cl_zsyr2k_args_s *clargs = (struct cl_zsyr2k_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
-    hipblasDoubleComplex beta;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta);
-
-    HIP_zsyr2k( uplo, trans,
-                 n, k,
-                 &alpha, tileA->mat, tileA->ld,
-                         tileB->mat, tileB->ld,
-                 &beta,  tileC->mat, tileC->ld,
-                 handle );
+    HIP_zsyr2k( clargs->uplo, clargs->trans,
+                clargs->n, clargs->k,
+                (hipblasDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld,
+                                                         tileB->mat, tileB->ld,
+                (hipblasDoubleComplex*)&(clargs->beta),  tileC->mat, tileC->ld,
+                handle );
 }
 #endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -117,17 +108,13 @@ CODELETS_GPU( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
-/**
- *
- * @ingroup INSERT_TASK_Complex64_t
- *
- */
-void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options,
-                       cham_uplo_t uplo, cham_trans_t trans,
-                       int n, int k, int nb,
-                       CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
-                       const CHAM_desc_t *B, int Bm, int Bn,
-                       CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn)
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
 {
     if ( alpha == 0. ) {
         INSERT_TASK_zlascal( options, uplo, n, n, nb,
@@ -135,30 +122,139 @@ void INSERT_TASK_zsyr2k(const RUNTIME_option_t *options,
         return;
     }
 
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zsyr2k;
-    void (*callback)(void*) = options->profiling ? cl_zsyr2k_callback : NULL;
-    int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+    void (*callback)(void*);
+    struct cl_zsyr2k_args_s *clargs  = NULL;
+    int                      exec    = 0;
+    const char              *cl_name = "zsyr2k";
+    int                      accessC;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+    }
+
+    /* Callback fro profiling information */
+    callback = options->profiling ? cl_zsyr2k_callback : NULL;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
+    /* Refine name */
+    cl_name = chameleon_codelet_name( cl_name, 3,
+                                      A->get_blktile( A, Am, An ),
+                                      B->get_blktile( B, Bm, Bn ),
+                                      C->get_blktile( C, Cm, Cn ) );
+
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,      &uplo,                sizeof(int),
-        STARPU_VALUE,     &trans,                sizeof(int),
-        STARPU_VALUE,         &n,                        sizeof(int),
-        STARPU_VALUE,         &k,                        sizeof(int),
-        STARPU_VALUE,     &alpha,         sizeof(CHAMELEON_Complex64_t),
-        STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
-        STARPU_R,                 RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
-        STARPU_VALUE,      &beta,         sizeof(CHAMELEON_Complex64_t),
-        accessC,                  RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
-        STARPU_PRIORITY,    options->priority,
-        STARPU_CALLBACK,    callback,
+        &cl_zsyr2k,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyr2k_args_s),
+        STARPU_R,      RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_R,      RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
+        accessC,       RTBLKADDR(C, ChamComplexDouble, Cm, Cn),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
+        STARPU_NAME,              cl_name,
         0 );
+
+    (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans,
+                         int n, int k, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                                                      const CHAM_desc_t *B, int Bm, int Bn,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, uplo, n, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zsyr2k, 3 );
+    int accessC;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zsyr2k_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zsyr2k_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsyr2k_callback );
+
+        /* Flops */
+        task->flops = flops_zsyr2k( k, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 3,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zsyr2k", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index e92f2b85f496c9f551f082b6c64a9fd966d892e7..32ded083dbad46d6a9a2ce2b6504e3766a7aac9b 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -30,10 +30,10 @@
 #include "runtime_codelet_z.h"
 
 struct cl_zsyrk_args_s {
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int n;
-    int k;
+    cham_uplo_t           uplo;
+    cham_trans_t          trans;
+    int                   n;
+    int                   k;
     CHAMELEON_Complex64_t alpha;
     CHAMELEON_Complex64_t beta;
 };
@@ -110,6 +110,7 @@ CODELETS_GPU( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_hip_func, STARPU_HIP_ASYNC )
 CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
@@ -122,11 +123,11 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zsyrk_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      accessC;
-    int                      exec = 0;
-    const char              *cl_name = "zsyrk";
+    struct cl_zsyrk_args_s *clargs  = NULL;
+    int                     exec    = 0;
+    const char             *cl_name = "zsyrk";
+    int                     accessC;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -174,3 +175,83 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
+                        cham_uplo_t uplo, cham_trans_t trans,
+                        int n, int k, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                        CHAMELEON_Complex64_t beta,  const CHAM_desc_t *C, int Cm, int Cn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, uplo, n, n, nb,
+                             beta, C, Cm, Cn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zsyrk, 2 );
+    int accessC;
+
+    /* Reduce the C access if needed */
+    accessC = ( beta == (CHAMELEON_Complex64_t)0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, C, Cm, Cn, accessC  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zsyrk_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zsyrk_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zsyrk_callback );
+
+        /* Flops */
+        task->flops = flops_zsyrk( k, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             C->get_blktile( C, Cm, Cn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zsyrk", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c
index a3f695f2c685eb05e66525c6fca957af247dbe42..39c7de9667a60b79bd3ee9eb2e04482c1e67f994 100644
--- a/runtime/starpu/codelets/codelet_ztradd.c
+++ b/runtime/starpu/codelets/codelet_ztradd.c
@@ -24,10 +24,10 @@
 #include "runtime_codelet_z.h"
 
 struct cl_ztradd_args_s {
-    cham_uplo_t uplo;
-    cham_trans_t trans;
-    int m;
-    int n;
+    cham_uplo_t           uplo;
+    cham_trans_t          trans;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
     CHAMELEON_Complex64_t beta;
 };
@@ -53,6 +53,7 @@ cl_ztradd_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU( ztradd, cl_ztradd_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
@@ -64,11 +65,11 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_ztradd_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      accessB;
-    int                      exec = 0;
+    struct cl_ztradd_args_s *clargs  = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "ztradd";
+    int                      accessB;
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -110,3 +111,83 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
+                         cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
+                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                         CHAMELEON_Complex64_t beta,  const CHAM_desc_t *B, int Bm, int Bn )
+{
+    if ( alpha == 0. ) {
+        INSERT_TASK_zlascal( options, uplo, m, n, nb,
+                             beta, B, Bm, Bn );
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( ztradd, 2 );
+    int accessB;
+
+
+    /* Reduce the B access if needed */
+    accessB = ( beta == 0. ) ? STARPU_W : STARPU_RW;
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, accessB  );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_ztradd_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_ztradd_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztradd_callback );
+
+        /* Flops */
+        //task->flops = flops_ztradd( m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_ztradd", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index 229de0304d8fd6b1be2199a25cf3231ea28ec9ca..d9d25ed28b1b5a275df649e124453a81ab93f082 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -28,12 +28,12 @@
 #include "runtime_codelet_z.h"
 
 struct cl_ztrmm_args_s {
-    cham_side_t side;
-    cham_uplo_t uplo;
-    cham_trans_t transA;
-    cham_diag_t diag;
-    int m;
-    int n;
+    cham_side_t           side;
+    cham_uplo_t           uplo;
+    cham_trans_t          transA;
+    cham_diag_t           diag;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
 };
 
@@ -106,15 +106,16 @@ CODELETS_GPU( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_hip_func, STARPU_HIP_ASYNC )
 CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
                         int m, int n, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn )
 {
-    struct cl_ztrmm_args_s *clargs = NULL;
     void (*callback)(void*);
-    int                      exec = 0;
+    struct cl_ztrmm_args_s *clargs   = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "ztrmm";
 
     /* Handle cache */
@@ -160,3 +161,74 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                        const CHAM_desc_t *B, int Bm, int Bn )
+{
+    INSERT_TASK_COMMON_PARAMETERS( ztrmm, 2 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R  );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+   if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_ztrmm_args_s ) );
+        clargs->side   = side;
+        clargs->uplo   = uplo;
+        clargs->transA = transA;
+        clargs->diag   = diag;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->alpha  = alpha;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_ztrmm_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztrmm_callback );
+
+        /* Flops */
+        task->flops = flops_ztrmm( side, m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_ztrmm", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 8d0656b9915417706d2564dce76e5c5b5b85bc97..6961500fd9518f8624d2fc03d7634695f1e97861 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -30,12 +30,12 @@
 #include "runtime_codelet_z.h"
 
 struct cl_ztrsm_args_s {
-    cham_side_t side;
-    cham_uplo_t uplo;
-    cham_trans_t transA;
-    cham_diag_t diag;
-    int m;
-    int n;
+    cham_side_t           side;
+    cham_uplo_t           uplo;
+    cham_trans_t          transA;
+    cham_diag_t           diag;
+    int                   m;
+    int                   n;
     CHAMELEON_Complex64_t alpha;
 };
 
@@ -113,15 +113,16 @@ CODELETS_GPU( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_hip_func, STARPU_HIP_ASYNC )
 CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
                         int m, int n, int nb,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn )
 {
-    struct cl_ztrsm_args_s  *clargs = NULL;
     void (*callback)(void*);
-    int                      exec = 0;
+    struct cl_ztrsm_args_s  *clargs  = NULL;
+    int                      exec    = 0;
     const char              *cl_name = "ztrsm";
 
     /* Handle cache */
@@ -168,3 +169,74 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
 
     (void)nb;
 }
+
+#else
+
+void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
+                        cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
+                        int m, int n, int nb,
+                        CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
+                        const CHAM_desc_t *B, int Bm, int Bn )
+{
+    INSERT_TASK_COMMON_PARAMETERS( ztrsm, 2 );
+
+    /*
+     * Set the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, A, Am, An, STARPU_R  );
+    starpu_cham_exchange_data_before_execution( options, params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+   if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_ztrsm_args_s ) );
+        clargs->side   = side;
+        clargs->uplo   = uplo;
+        clargs->transA = transA;
+        clargs->diag   = diag;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->alpha  = alpha;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_ztrsm_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_ztrsm_callback );
+
+        /* Flops */
+        task->flops = flops_ztrsm( side, m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 2,
+                                             A->get_blktile( A, Am, An ),
+                                             B->get_blktile( B, Bm, Bn ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_ztrsm", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif
diff --git a/runtime/starpu/control/runtime_async.c b/runtime/starpu/control/runtime_async.c
index 790181977e43c4d721bdb398f9a79701e0a5f1e7..848d0f7aa54291b4a86546ead770ad2f1c422f4a 100644
--- a/runtime/starpu/control/runtime_async.c
+++ b/runtime/starpu/control/runtime_async.c
@@ -24,11 +24,10 @@
 /**
  *  Create a sequence
  */
-int RUNTIME_sequence_create( CHAM_context_t  *chamctxt,
+int RUNTIME_sequence_create( CHAM_context_t     *chamctxt,
                              RUNTIME_sequence_t *sequence )
 {
     (void)chamctxt;
-    sequence->comm = chamctxt->comm;
     return CHAMELEON_SUCCESS;
 }
 
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 2c285f33056c034220f273701823914b1d75c34f..d24dce7a70c5b9eacc2211c983a10fb34220a716 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -55,6 +55,9 @@
 #cmakedefine HAVE_STARPU_MPI_REDUX
 #cmakedefine HAVE_STARPU_MPI_REDUX_WRAPUP
 #cmakedefine HAVE_STARPU_MPI_DATA_CPY_PRIORITY
+#cmakedefine HAVE_STARPU_MPI_EXCHANGE_DATA_BEFORE_EXECUTION
+
+#cmakedefine CHAMELEON_STARPU_USE_INSERT
 
 #if (!defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER) && !defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER)) && defined(CHAMELEON_USE_MPI_DATATYPES)
 #error "This version of StarPU does not support MPI datatypes (Please compile with -DCHAMELEON_USE_MPI_DATATYPES=OFF)"
diff --git a/runtime/starpu/include/chameleon_starpu_internal.h b/runtime/starpu/include/chameleon_starpu_internal.h
index 8b7c3ecd047f308787a9fbb25f755dd0ac0b3b89..977733dc9e72518c539f91a2a45074016ff49d28 100644
--- a/runtime/starpu/include/chameleon_starpu_internal.h
+++ b/runtime/starpu/include/chameleon_starpu_internal.h
@@ -28,6 +28,7 @@
 
 #include "control/common.h"
 #include "chameleon_starpu.h"
+#include "chameleon/flops.h"
 
 /* Chameleon interfaces for StarPU */
 #include "cham_tile_interface.h"
@@ -125,6 +126,18 @@ void RUNTIME_set_reduction_methods(starpu_data_handle_t handle, cham_flttype_t d
 #include "runtime_mpi.h"
 #include "runtime_wontuse.h"
 
+static inline starpu_data_handle_t *
+chameleon_starpu_data_gethandle( const CHAM_desc_t *A, int m, int n )
+{
+    int64_t mm = m + (A->i / A->mb);
+    int64_t nn = n + (A->j / A->nb);
+
+    starpu_data_handle_t *ptrtile = A->schedopt;
+    ptrtile += ((int64_t)A->lmt) * nn + mm;
+
+    return ptrtile;
+}
+
 #if defined(CHAMELEON_USE_MPI) && defined(HAVE_STARPU_MPI_CACHED_RECEIVE)
 static inline int
 chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n)
@@ -205,4 +218,272 @@ chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n)
 #define RUNTIME_END_ACCESS_DECLARATION          \
     RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION;
 
+#define INSERT_TASK_COMMON_PARAMETERS( _name_, _nbuffer_ )      \
+    struct starpu_data_descr descrs[_nbuffer_];                 \
+    struct starpu_mpi_task_exchange_params params;              \
+    struct cl_##_name_##_args_s *clargs  = NULL;                \
+    struct starpu_codelet       *cl      = &cl_##_name_;        \
+    const char                  *cl_name = #_name_;             \
+    int                          nbdata  = 0;
+
+/**
+ * This section defines the codelet functions to manage MPI cache and data
+ * echanges before and after submitting tasks
+ */
+#if !defined(CHAMELEON_STARPU_USE_INSERT)
+#if !defined(CHAMELEON_USE_MPI)
+
+/**
+ * @brief Empty data structure to mimic the one provided by StarPU when MPI is enabled
+ */
+struct starpu_mpi_task_exchange_params {
+    int do_execute;
+};
+
+static inline void
+starpu_cham_exchange_init_params( const RUNTIME_option_t                 *options,
+                                  struct starpu_mpi_task_exchange_params *params,
+                                  int                                     xrank )
+{
+    params->do_execute = 1;
+    (void)options;
+    (void)xrank;
+}
+
+static inline void
+starpu_cham_exchange_data_before_execution( const RUNTIME_option_t                *options,
+                                            struct starpu_mpi_task_exchange_params params,
+                                            int                                   *nbdata,
+                                            struct starpu_data_descr              *descrs,
+                                            const CHAM_desc_t                     *A,
+                                            int                                    Am,
+                                            int                                    An,
+                                            enum starpu_data_access_mode           mode )
+{
+    descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An );
+    descrs[*nbdata].mode   = mode;
+    (*nbdata)++;
+
+    (void)options;
+    (void)params;
+    return;
+}
+
+#define starpu_cham_task_exchange_data_after_execution( ... ) do {} while(0)
+
+#else
+
+/**
+ * @brief Internal function to initialize the StarPU paramas structure.
+ *
+ * @param[in] options
+ *          The runtime options used to set common informations such as
+ *          communicator, rank, and priority.
+ *
+ * @param[in,out] params
+ *          On entry, the allocated params structure. On exit the fields of the
+ *          structure are initialized.
+ *
+ * @param[in] xrank
+ *          The MPI rank that will execute the task. STARPU_MPI_PER_NODE if all
+ *          nodes excute it.
+ *
+ */
+static inline void
+starpu_cham_exchange_init_params( const RUNTIME_option_t                 *options,
+                                  struct starpu_mpi_task_exchange_params *params,
+                                  int                                     xrank )
+{
+    params->me         = options->sequence->myrank;
+    params->xrank      = xrank;
+    params->priority   = options->priority;
+    params->do_execute = ( xrank == STARPU_MPI_PER_NODE ) || ( xrank == params->me );
+}
+
+/**
+ * @brief Internal wrapper to starpu_mpi_task_exchange_data_before_execution(),
+ * that also perform the cache operation done in the CAHMELEON_ACCESS_X() macros
+ * in other runtimes.
+ *
+ * @param[in] options
+ *          The options to parameterize the task
+ *
+ * @param[in] params
+ *          The starpu parameters for the exchange functions. Needs to be
+ *          initialized by starpu_cham_init_exchange_param() function.
+ *
+ * @param[in,out] nbdata
+ *          On entry the number of data already registered in descrs. On exist,
+ *          the counter is updated if the next handle is registered in the
+ *          structure.
+ *
+ * @param[in,out] descrs
+ *          The array of starpu data descriptors (handle + mode). On entry, it
+ *          is allcoated to the maximum number of data for the task, and
+ *          contains the already registered nbdata handles and their associated
+ *          modes. On exit, it is updated with the new handle if needed.
+ *
+ * @param[in] A
+ *          The descriptor in which to find the piece of data
+ *
+ * @param[in] Am
+ *          The row index of the piece of data
+ *
+ * @param[in] An
+ *          The column index of the piece of data
+ *
+ * @param[in] mode
+ *          The access mode
+ *
+ */
+static inline void
+starpu_cham_exchange_data_before_execution( const RUNTIME_option_t                *options,
+                                            struct starpu_mpi_task_exchange_params params,
+                                            int                                   *nbdata,
+                                            struct starpu_data_descr              *descrs,
+                                            const CHAM_desc_t                     *A,
+                                            int                                    Am,
+                                            int                                    An,
+                                            enum starpu_data_access_mode           mode )
+{
+    unsigned              need_submit = 0;
+    starpu_data_handle_t *ptrtile     = chameleon_starpu_data_gethandle( A, Am, An );
+
+    /*
+     * Manage local cache through internal function to avoid the creation of
+     * handles if not necessary
+     */
+    if ( chameleon_desc_islocal( A, Am, An ) ) {
+        need_submit = 1;
+    }
+    else {
+        if ( *ptrtile && ( mode & STARPU_W ) &&
+             starpu_mpi_cached_receive( *ptrtile ) )
+        {
+            need_submit = 1;
+        }
+    }
+    if ( options->forcesub && ( mode & STARPU_MPI_REDUX ) )
+    {
+        need_submit = 1;
+    }
+
+    if ( !need_submit && !params.do_execute ) {
+        return;
+    }
+
+    /*
+     * If we need to submit, let's create the data handle and ask StarPU to perform
+     * the necessary communications
+     */
+    descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An );
+    descrs[*nbdata].mode   = mode;
+
+    starpu_mpi_exchange_data_before_execution(
+        options->sequence->comm, descrs[*nbdata].handle, mode, params );
+
+    (*nbdata)++;
+    return;
+}
+
+/**
+ * @brief Internal wrapper to starpu_mpi_task_exchange_data_after_execution().
+ *
+ * @param[in] options
+ *          The options to get the communicator.
+ *
+ * @param[in] params
+ *          The structure that stores a few parameters initialized by
+ *          starpu_mpi_task_exchange_data_before_execution().
+ *
+ * @param[in] nbdata
+ *          The size of the descr array.
+ *
+ * @param[in] descrs
+ *          The array of the handle with their associated mode. The array is
+ *          initialized in starpu_mpi_task_exchange_data_before_execution().
+ *
+ */
+static inline void
+starpu_cham_task_exchange_data_after_execution( const RUNTIME_option_t                *options,
+                                                struct starpu_mpi_task_exchange_params params,
+                                                int                                    nbdata,
+                                                struct starpu_data_descr              *descrs )
+{
+    starpu_mpi_task_exchange_data_after_execution(
+        options->sequence->comm, descrs, nbdata, params );
+}
+
+#endif
+
+typedef void (*callback_fct_t)(void *);
+
+/**
+ * @brief Internal function to initialize the task common parts.
+ *
+ * @param[in] options
+ *          The runtime options used to set common informations such as
+ *          communicator, rank, and priority.
+ *
+ * @param[in,out] task
+ *          The task for which to complete the initialization
+ *
+ * @param[in] nbdata
+ *          The size of the descr array.
+ *
+ * @param[in] descrs
+ *          The array of the handle with their associated mode. The array is
+ *          initialized in starpu_mpi_task_exchange_data_before_execution().
+ *
+ * @param[in] callback
+ *          The profiling callback function pointe used if profiling is enabled.
+ *
+ */
+static inline void
+starpu_cham_task_set_options( const RUNTIME_option_t   *options,
+                              struct starpu_task       *task,
+                              int                       nbdata,
+                              struct starpu_data_descr *descrs,
+                              callback_fct_t            callback )
+{
+    int i;
+
+    task->priority = options->priority;
+
+#if defined(CHAMELEON_RUNTIME_SYNC)
+    task->synchronous = 1;
+#endif
+
+    /* Callback for profiling information */
+    if ( options->profiling ) {
+        task->callback_func = callback;
+    }
+
+    /* Specific worker id */
+    if ( options->workerid != -1 ) {
+        task->workerid                     = options->workerid;
+        task->execute_on_a_specific_worker = 1;
+    }
+
+    /* Parallel tasks */
+    task->possibly_parallel = options->parallel;
+
+    /* Set the where here */
+    // task->where; /* Do restriction here */
+
+    task->nbuffers = nbdata;
+    for ( i = 0; i < task->nbuffers; i++ ) {
+        enum starpu_data_access_mode mode = descrs[i].mode;
+        assert( descrs[i].handle );
+
+        if ( mode & STARPU_MPI_REDUX ) {
+            mode = STARPU_RW | STARPU_COMMUTE;
+        }
+
+        STARPU_TASK_SET_HANDLE( task, descrs[i].handle, i );
+        STARPU_TASK_SET_MODE( task, mode, i );
+    }
+}
+#endif /* !defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #endif /* _chameleon_starpu_internal_h_ */