diff --git a/compute/zgetrf.c b/compute/zgetrf.c
index 169d038b57ae520073ce244f8b82d902ef1366ff..d69abaecb3ea9de4f21b5b34c5789f44f4b1380f 100644
--- a/compute/zgetrf.c
+++ b/compute/zgetrf.c
@@ -59,7 +59,7 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
 {
     CHAM_context_t             *chamctxt;
     struct chameleon_pzgetrf_s *ws;
-    int lookahead;
+    int lookahead, batch_size;
 
     chamctxt = chameleon_context_self();
     if ( chamctxt == NULL ) {
@@ -114,21 +114,16 @@ CHAMELEON_zgetrf_WS_Alloc( const CHAM_desc_t *A )
         chameleon_cleanenv( allreduce );
     }
 
-    ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", 0 );
-    if ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) {
-        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS2 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS2 value\n" );
-        ws->batch_size_blas2 = CHAMELEON_BATCH_SIZE;
-    }
-    ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", 0 );
-    if ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) {
-        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_BLAS3 must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_BLAS3 value\n" );
-        ws->batch_size_blas3 = CHAMELEON_BATCH_SIZE;
-    }
-    ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", 0 );
-    if ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) {
-        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE_SWAP must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE_SWAP value\n" );
-        ws->batch_size_swap = CHAMELEON_BATCH_SIZE;
+    batch_size = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE", 0 );
+    if ( batch_size > CHAMELEON_BATCH_SIZE ) {
+        chameleon_warning( "CHAMELEON_BATCH_SIZE", "CHAMELEON_GETRF_BATCH_SIZE must be smaller than CHAMELEON_BATCH_SIZE, please recompile with the right CHAMELEON_BATCH_SIZE, or reduce the CHAMELEON_GETRF_BATCH_SIZE value\n" );
     }
+    ws->batch_size_blas2 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS2", batch_size );
+    ws->batch_size_blas2 = ( ws->batch_size_blas2 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas2;
+    ws->batch_size_blas3 = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_BLAS3", batch_size );
+    ws->batch_size_blas3 = ( ws->batch_size_blas3 > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_blas3;
+    ws->batch_size_swap = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_BATCH_SIZE_SWAP", batch_size );
+    ws->batch_size_swap = ( ws->batch_size_swap > CHAMELEON_BATCH_SIZE ) ? CHAMELEON_BATCH_SIZE : ws->batch_size_swap;
 
     ws->ringswitch = chameleon_getenv_get_value_int( "CHAMELEON_GETRF_RINGSWITCH", INT_MAX );
 
diff --git a/include/chameleon/flops.h b/include/chameleon/flops.h
index dacb47113a618dfaf49ab20f9133d4cfb19da720..0635491bfbdc3c93046b5bd9fa13b418ac8b728d 100644
--- a/include/chameleon/flops.h
+++ b/include/chameleon/flops.h
@@ -71,10 +71,12 @@
 #define FMULS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)+1.))
 #define FADDS_TRMM_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
 
-
 #define FMULS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FMULS_TRMM_2((__m), (__n)) : FMULS_TRMM_2((__n), (__m)) )
 #define FADDS_TRMM(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FADDS_TRMM_2((__m), (__n)) : FADDS_TRMM_2((__n), (__m)) )
 
+#define FMULS_TRSM_UNIT_2(__m, __n) (0.5 * (double)(__n) * (double)(__m) * ((double)(__m)-1.))
+#define FMULS_TRSM_UNIT(__side, __m, __n) ( ( (__side) == ChamLeft ) ? FMULS_TRSM_UNIT_2((__m), (__n)) : FMULS_TRSM_UNIT_2((__n), (__m)) )
+
 #define FMULS_TRSM FMULS_TRMM
 #define FADDS_TRSM FADDS_TRMM
 
@@ -236,6 +238,11 @@ static inline double flops_ctrsm( cham_side_t __side, double __m, double __n) {
 static inline double flops_dtrsm( cham_side_t __side, double __m, double __n) { double flops =  (     FMULS_TRSM(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
 static inline double flops_strsm( cham_side_t __side, double __m, double __n) { double flops =  (     FMULS_TRSM(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
 
+static inline double flops_ztrsm_unit( cham_side_t __side, double __m, double __n) { double flops =  (6. * FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
+static inline double flops_ctrsm_unit( cham_side_t __side, double __m, double __n) { double flops =  (6. * FMULS_TRSM_UNIT(__side, (__m), (__n)) + 2.0 * FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
+static inline double flops_dtrsm_unit( cham_side_t __side, double __m, double __n) { double flops =  (     FMULS_TRSM_UNIT(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
+static inline double flops_strsm_unit( cham_side_t __side, double __m, double __n) { double flops =  (     FMULS_TRSM_UNIT(__side, (__m), (__n)) +       FADDS_TRSM(__side, (__m), (__n)) ); return flops; }
+
 /*
  * Lapack
  */
@@ -347,10 +354,68 @@ static inline double flops_cgebrd( double __m, double __n) { double flops =  (6.
 static inline double flops_dgebrd( double __m, double __n) { double flops =  (     FMULS_GEBRD((__m), (__n)) +       FADDS_GEBRD((__m), (__n)) ); return flops; }
 static inline double flops_sgebrd( double __m, double __n) { double flops =  (     FMULS_GEBRD((__m), (__n)) +       FADDS_GEBRD((__m), (__n)) ); return flops; }
 
+static inline double flops_zscal( double __m ) { double flops =  (6. * (double)(__m)); return flops; }
+static inline double flops_cscal( double __m ) { double flops =  (6. * (double)(__m)); return flops; }
+static inline double flops_dscal( double __m ) { double flops =  (     (double)(__m)); return flops; }
+static inline double flops_sscal( double __m ) { double flops =  (     (double)(__m)); return flops; }
+
 /*
  * Norms
  */
 #define FMULS_LANGE(__m, __n) ((double)(__m) * (double)(__n))
 #define FADDS_LANGE(__m, __n) ((double)(__m) * (double)(__n))
 
+/*
+ * Getrf with partial pivoting: flop counts of a single step h, per precision
+ */
+#define FLOPS_GETRF_BLOCKED_OFFDIAG( _prec_ )                           \
+    static inline double flops_##_prec_##getrf_blocked_offdiag( int m, int n, int h, int ib ) \
+    {                                                                   \
+        double flops = 0.;                                              \
+        int kk, nn;                                                     \
+        if ( h == 0 ) {                                                 \
+            return 0.;                                                  \
+        }                                                               \
+        /* scal: m entries */                                           \
+        flops += flops_##_prec_##scal( m );                             \
+        /* blas 3 gemm: h multiple of ib, (m, n-h, ib) */               \
+        if ( h % ib == 0 ) {                                            \
+            kk = ib;                                                    \
+            nn = n - h;                                                 \
+        }                                                               \
+        /* blas 2 geru: inside a block, (m, ib - h%ib, 1) */            \
+        else {                                                          \
+            kk = 1;                                                     \
+            nn = ib - h % ib;                                           \
+        }                                                               \
+        flops += flops_##_prec_##gemm( m, nn, kk );                     \
+        return flops;                                                   \
+    }
+
+FLOPS_GETRF_BLOCKED_OFFDIAG( z )
+FLOPS_GETRF_BLOCKED_OFFDIAG( c )
+FLOPS_GETRF_BLOCKED_OFFDIAG( d )
+FLOPS_GETRF_BLOCKED_OFFDIAG( s )
+
+/* Diagonal variants: +1 operation (counted as 6 flops in complex) for the 1/pivot */
+static inline double flops_zgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; }
+static inline double flops_cgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, ib ) + 1. * 6.; }
+static inline double flops_dgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; }
+static inline double flops_sgetrf_blocked_diag( int m, int n, int h, int ib ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, ib ) + 1.; }
+
+static inline double flops_zgetrf_percol_diag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m-h, n, h, n ) + 1. * 6.; }
+static inline double flops_cgetrf_percol_diag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m-h, n, h, n ) + 1. * 6.; }
+static inline double flops_dgetrf_percol_diag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; }
+static inline double flops_sgetrf_percol_diag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m-h, n, h, n ) + 1.; }
+
+static inline double flops_zgetrf_percol_offdiag( int m, int n, int h ){ return flops_zgetrf_blocked_offdiag( m, n, h, n ); }
+static inline double flops_cgetrf_percol_offdiag( int m, int n, int h ){ return flops_cgetrf_blocked_offdiag( m, n, h, n ); }
+static inline double flops_dgetrf_percol_offdiag( int m, int n, int h ){ return flops_dgetrf_blocked_offdiag( m, n, h, n ); }
+static inline double flops_sgetrf_percol_offdiag( int m, int n, int h ){ return flops_sgetrf_blocked_offdiag( m, n, h, n ); }
+
+static inline double flops_zgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ztrsm_unit( ChamLeft, ib, n-h ) : 0.; }
+static inline double flops_cgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_ctrsm_unit( ChamLeft, ib, n-h ) : 0.; }
+static inline double flops_dgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_dtrsm_unit( ChamLeft, ib, n-h ) : 0.; }
+static inline double flops_sgetrf_trsm( int m, int n, int h, int ib ){ return ( n - h ) > 0 ? flops_strsm_unit( ChamLeft, ib, n-h ) : 0.; }
+
 #endif /* _flops_h_ */
diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c
index aace97617b754005d6a12c3dd55641006cb94f5a..0cdc6cafaf4060132b0b770c4d532eab085c8e5f 100644
--- a/runtime/starpu/codelets/codelet_zgeadd.c
+++ b/runtime/starpu/codelets/codelet_zgeadd.c
@@ -78,6 +78,7 @@ cl_zgeadd_cuda_func( void *descr[], void *cl_arg )
 CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC )
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
                          cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
@@ -133,7 +134,7 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
                          cham_trans_t trans, int m, int n, int nb,
@@ -156,8 +157,8 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB  );
 
     /*
      * Not involved, let's return
@@ -209,4 +210,4 @@ void INSERT_TASK_zgeadd( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index de400f20da43773e00b960a6fb02cfa640726137..1fe76c9b782c0b52de548f1d851e17250c732dc5 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -127,6 +127,7 @@ CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options,
                               cham_trans_t transA, cham_trans_t transB,
                               int m, int n, int k, int nb,
@@ -297,7 +298,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         0 );
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void __INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                           int xrank, int accessC,
@@ -319,9 +320,9 @@ void __INSERT_TASK_zgemm( const RUNTIME_option_t *options,
      * Register the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, xrank );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -425,4 +426,5 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                                 B, Bm, Bn,
                          beta,  C, Cm, Cn );
 }
-#endif
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_batched.c b/runtime/starpu/codelets/codelet_zgetrf_batched.c
index 4bb70d4b45da5d93cd6f0c1e168109e27d872b84..0ff4ed9854228109928e30ae4b34013338a32a5c 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_batched.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_batched.c
@@ -24,7 +24,7 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
-struct cl_getrf_batched_args_t {
+struct cl_zgetrf_batched_args_s {
     const char              *cl_name;
     int                      tasks_nbr;
     int                      diag;
@@ -36,16 +36,39 @@ struct cl_getrf_batched_args_t {
     struct starpu_data_descr handle_mode[CHAMELEON_BATCH_SIZE];
 };
 
+static inline double flops_zgetrf_percol_batched( int *m, int *n, int h, int t )
+{
+    double total = 0.; /* per-column off-diagonal flops summed over the t entries */
+    int    tile  = 0;
+    for ( ; tile < t; tile++ ) {
+        total += flops_zgetrf_percol_offdiag( m[tile], n[tile], h );
+    }
+    return total;
+}
+
+static inline double flops_zgetrf_blocked_batched( int *m, int *n, int h, int ib, int d, int t )
+{
+    double total = 0.; /* entry 0 uses the diag count when d == 1, the rest offdiag */
+    int    tile  = d;
+    if ( d == 1 ) {
+        total += flops_zgetrf_blocked_diag( m[0] - h, n[0], h, ib );
+    }
+    for ( ; tile < t; tile++ ) {
+        total += flops_zgetrf_blocked_offdiag( m[tile] - h, n[tile], h, ib );
+    }
+    return total;
+}
+
 #if !defined(CHAMELEON_SIMULATION)
 static void
 cl_zgetrf_panel_offdiag_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
-    struct cl_getrf_batched_args_t *clargs  = (struct cl_getrf_batched_args_t *) cl_arg;
-    cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ];
-    cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ];
-    int                             i, m, n, h, m0, lda;
-    CHAM_tile_t                    *tileA;
+    struct cl_zgetrf_batched_args_s *clargs  = (struct cl_zgetrf_batched_args_s *) cl_arg;
+    cppi_interface_t                *nextpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr ];
+    cppi_interface_t                *prevpiv = (cppi_interface_t*) descr[ clargs->tasks_nbr + 1 ];
+    int                              i, m, n, h, m0, lda;
+    CHAM_tile_t                     *tileA;
 
     nextpiv->h = clargs->h;
     nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag );
@@ -73,19 +96,18 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
                                           void **clargs_ptr,
                                           CHAM_ipiv_t *ipiv )
 {
-    int          task_num   = 0;
-    int          batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2;
-    void (*callback)(void*) = NULL;
-    struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
-    int rankA = A->get_rankof( A, Am, An );
-    if ( rankA != A->myrank ) {
-        return;
-    }
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_percol_offdiag_batched: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    int    task_num   = 0;
+    int    batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_blas2;
+    struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr;
+    int rankA = A->get_rankof( A, Am, An );
+    if ( rankA != A->myrank ) {
+        return;
+    }
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
@@ -93,8 +115,8 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     if ( clargs == NULL ) {
-        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) ) ;
-        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
+        clargs = malloc( sizeof( struct cl_zgetrf_batched_args_s ) ) ;
+        memset( clargs, 0, sizeof( struct cl_zgetrf_batched_args_s ) );
         clargs->tasks_nbr   = 0;
         clargs->h           = h;
         clargs->cl_name     = "zgetrf_panel_offdiag_batched";
@@ -114,39 +136,26 @@ INSERT_TASK_zgetrf_panel_offdiag_batched( const RUNTIME_option_t *options,
                                               A->get_blktile( A, Am, An ) );
 
     if ( clargs->tasks_nbr == batch_size ) {
-        int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
-        int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
-        rt_starpu_insert_task(
-            &cl_zgetrf_panel_offdiag_batched,
-            /* Task codelet arguments */
-            STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
-            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
-            STARPU_PRIORITY,          options->priority,
-            STARPU_CALLBACK,          callback,
-            STARPU_EXECUTE_ON_WORKER, options->workerid,
-            0 );
-
-        /* clargs is freed by starpu. */
-        *clargs_ptr = NULL;
+        INSERT_TASK_zgetrf_panel_offdiag_batched_flush( options, A, An, clargs_ptr, ipiv );
     }
 }
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void
 INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
                                                 CHAM_desc_t *A, int An,
                                                 void **clargs_ptr,
                                                 CHAM_ipiv_t *ipiv )
 {
-    void (*callback)(void*) = NULL;
-    struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
-    int rankA = A->myrank;
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_percol_offdiag_batched: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    void (*callback)(void*) = NULL;
+    struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr;
+    int rankA = A->myrank;
 
     if ( clargs == NULL ) {
         return;
@@ -157,7 +166,7 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zgetrf_panel_offdiag_batched,
         /* Task codelet arguments */
-        STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_zgetrf_batched_args_s),
         STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h   ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h-1 ),
@@ -171,12 +180,75 @@ INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
     *clargs_ptr = NULL;
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+/* Submits the pending batch held in *clargs_ptr (if any) as one task, then resets *clargs_ptr. */
+void
+INSERT_TASK_zgetrf_panel_offdiag_batched_flush( const RUNTIME_option_t *options,
+                                                CHAM_desc_t *A, int An,
+                                                void **clargs_ptr,
+                                                CHAM_ipiv_t *ipiv )
+{
+    struct cl_zgetrf_batched_args_s *myclargs = *clargs_ptr;
+    int rankA = A->myrank;
+    int k, ret, access_npiv, access_ppiv;
+    struct starpu_task *task;
+
+    if ( myclargs == NULL ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_panel_percol_offdiag_batched, zgetrf_panel_offdiag_batched, zgetrf_batched, myclargs->tasks_nbr + 2 );
+
+    access_npiv = ( myclargs->h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    access_ppiv = ( myclargs->h == 0 )       ? STARPU_NONE : STARPU_R;
+
+    /* Register the batched tile handles (RW) then the two pivot handles, no exchange needed */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    for ( k = 0; k < myclargs->tasks_nbr; k++ ) {
+        starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW );
+    }
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h-1 ), access_ppiv );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters: cl_arg_free hands the ownership of myclargs to StarPU */
+    task->cl_arg      = myclargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_batched_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = flops_zgetrf_percol_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->tasks_nbr );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        *clargs_ptr = NULL; /* cl_arg was released with the task: do not leave a dangling pointer */
+        chameleon_error( "INSERT_TASK_zgetrf_panel_offdiag_batched_flush", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    /* clargs is freed by starpu. */
+    *clargs_ptr = NULL;
+    (void)clargs;
+    (void)cl_name;
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #if !defined(CHAMELEON_SIMULATION)
 static void
 cl_zgetrf_panel_blocked_batched_cpu_func( void *descr[],
                                           void *cl_arg )
 {
-    struct cl_getrf_batched_args_t *clargs  = ( struct cl_getrf_batched_args_t * ) cl_arg;
+    struct cl_zgetrf_batched_args_s *clargs  = ( struct cl_zgetrf_batched_args_s * ) cl_arg;
     int                            *ipiv;
     cppi_interface_t               *nextpiv = (cppi_interface_t*) descr[clargs->tasks_nbr ];
     cppi_interface_t               *prevpiv = (cppi_interface_t*) descr[clargs->tasks_nbr + 1];
@@ -241,21 +313,19 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
                                           void **clargs_ptr,
                                           CHAM_ipiv_t *ipiv )
 {
-    struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws;
-    int          ib         = tmp->ib;
-    int          batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3;
-    int          task_num   = 0;
-    void (*callback)(void*) = NULL;
-    int accessU, access_npiv, access_ipiv, access_ppiv;
-    struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
-    int rankA = A->get_rankof(A, Am, An);
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    struct chameleon_pzgetrf_s *tmp = (struct chameleon_pzgetrf_s *) ws;
+    int ib         = tmp->ib;
+    int batch_size = ( (h % ib) != 0 ) ? tmp->batch_size_blas2 : tmp->batch_size_blas3;
+    int task_num   = 0;
+    struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr;
 
 #if defined ( CHAMELEON_USE_MPI )
+    int rankA = A->get_rankof(A, Am, An);
     if ( ( Am == An ) && ( h % ib == 0 ) && ( h > 0 ) ) {
         starpu_mpi_cache_flush( options->sequence->comm,
                                 RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) );
@@ -277,8 +347,8 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
     CHAMELEON_END_ACCESS_DECLARATION;
 
     if ( clargs == NULL ) {
-        clargs = malloc( sizeof( struct cl_getrf_batched_args_t ) );
-        memset( clargs, 0, sizeof( struct cl_getrf_batched_args_t ) );
+        clargs = malloc( sizeof( struct cl_zgetrf_batched_args_s ) );
+        memset( clargs, 0, sizeof( struct cl_zgetrf_batched_args_s ) );
         clargs->tasks_nbr         = 0;
         clargs->diag              = ( Am == An );
         clargs->ib                = ib;
@@ -300,47 +370,12 @@ INSERT_TASK_zgetrf_panel_blocked_batched( const RUNTIME_option_t *options,
                                               A->get_blktile( A, Am, An ) );
 
     if ( clargs->tasks_nbr == batch_size ) {
-        access_npiv = ( clargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
-        access_ipiv = STARPU_RW;
-        access_ppiv = STARPU_R;
-        accessU     = STARPU_RW;
-        if ( clargs->h == 0 ) {
-            access_ipiv = STARPU_W;
-            access_ppiv = STARPU_NONE;
-            accessU     = STARPU_NONE;
-        }
-        else if ( clargs->h % clargs->ib == 0 ) {
-            accessU = STARPU_R;
-        }
-        else if ( clargs->h % clargs->ib == 1 ) {
-            accessU = STARPU_W;
-        }
-        /* If there isn't a diag task then use offdiag access */
-        if ( clargs->diag == 0 ) {
-            accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
-            access_ipiv = STARPU_NONE;
-        }
-
-        rt_starpu_insert_task(
-            &cl_zgetrf_panel_blocked_batched,
-            /* Task codelet arguments */
-            STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
-            STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
-            access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
-            access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
-            access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
-            accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un ),
-            STARPU_PRIORITY,          options->priority,
-            STARPU_CALLBACK,          callback,
-            STARPU_EXECUTE_ON_WORKER, options->workerid,
-            STARPU_NAME,              clargs->cl_name,
-            0 );
-
-        /* clargs is freed by starpu. */
-        *clargs_ptr = NULL;
+        INSERT_TASK_zgetrf_panel_blocked_batched_flush( options, A, An, U, Um, Un, clargs_ptr, ipiv );
     }
 }
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void
 INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
                                                 CHAM_desc_t *A, int An,
@@ -348,15 +383,15 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
                                                 void **clargs_ptr,
                                                 CHAM_ipiv_t *ipiv )
 {
-    int accessU, access_npiv, access_ipiv, access_ppiv;
-    void (*callback)(void*) = NULL;
-    struct cl_getrf_batched_args_t *clargs = *clargs_ptr;
-    int rankA = A->myrank;
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_panel_blocked_batched: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    int accessU, access_npiv, access_ipiv, access_ppiv;
+    void (*callback)(void*) = NULL;
+    struct cl_zgetrf_batched_args_s *clargs = *clargs_ptr;
+    int rankA = A->myrank;
 
     if ( clargs == NULL ) {
         return;
@@ -386,7 +421,7 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zgetrf_panel_blocked_batched,
         /* Task codelet arguments */
-        STARPU_CL_ARGS,           clargs, sizeof(struct cl_getrf_batched_args_t),
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_zgetrf_batched_args_s),
         STARPU_DATA_MODE_ARRAY,   clargs->handle_mode, clargs->tasks_nbr,
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, clargs->h - 1 ),
@@ -401,3 +436,91 @@ INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
     /* clargs is freed by starpu. */
     *clargs_ptr = NULL;
 }
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void
+INSERT_TASK_zgetrf_panel_blocked_batched_flush( const RUNTIME_option_t *options,
+                                                CHAM_desc_t *A, int An,
+                                                CHAM_desc_t *U, int Um, int Un,
+                                                void **clargs_ptr,
+                                                CHAM_ipiv_t *ipiv )
+{
+    /* Submit the batch pending in *clargs_ptr as one StarPU task; no-op when none is pending. */
+    struct cl_zgetrf_batched_args_s *myclargs = *clargs_ptr;
+    int rankA = A->myrank;
+    int accessU, access_npiv, access_ipiv, access_ppiv, k, ret;
+    struct starpu_task *task;
+
+    if ( myclargs == NULL ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_panel_blocked_batched, zgetrf_panel_blocked_batched, zgetrf_batched, myclargs->tasks_nbr + 4 );
+
+    access_npiv = ( myclargs->h == ipiv->n ) ? STARPU_R : STARPU_REDUX;
+    access_ipiv = STARPU_RW;
+    access_ppiv = STARPU_R;
+    accessU     = STARPU_RW;
+    if ( myclargs->h == 0 ) {
+        access_ipiv = STARPU_W;
+        access_ppiv = STARPU_NONE;
+        accessU     = STARPU_NONE;
+    }
+    else if ( myclargs->h % myclargs->ib == 0 ) {
+        accessU = STARPU_R;
+    }
+    else if ( myclargs->h % myclargs->ib == 1 ) {
+        accessU = STARPU_W;
+    }
+    /* If there isn't a diag task then use offdiag access */
+    if ( myclargs->diag == 0 ) {
+        accessU     = ((myclargs->h%myclargs->ib == 0) && (myclargs->h > 0)) ? STARPU_R : STARPU_NONE;
+        access_ipiv = STARPU_NONE;
+    }
+
+    /*
+     * Register the data handles, exchange needed only for U
+     */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    for ( k = 0; k < myclargs->tasks_nbr; k++ ) {
+        starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW );
+    }
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, myclargs->h-1 ), access_ppiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An),                       access_ipiv );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+                                                  accessU );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    task->cl_arg      = myclargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_batched_args_s );
+    task->cl_arg_free = 1;
+    /* cl_arg_free: StarPU now frees the batch with the task, even when submission
+     * fails below, so clear the caller's pointer before any early return. */
+    *clargs_ptr = NULL;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = flops_zgetrf_blocked_batched( myclargs->m, myclargs->n, myclargs->h, myclargs->ib,
+                                                myclargs->diag, myclargs->tasks_nbr );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_panel_blocked_batched", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    (void)clargs;
+    (void)cl_name;
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_blocked.c b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
index 6bbc5f0c4d00574bb53f59538aa179d98cbf4cf7..f1df48f3cc7c3b6d460f859bc064e841bd4f5dc7 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_blocked.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_blocked.c
@@ -22,6 +22,16 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zgetrf_blocked_args_s { /* cl_arg payload shared by the zgetrf_blocked diag, offdiag and trsm codelets */
+    int                 m;        /* kernel argument; trsm writes the pivot row into row m-1 of U */
+    int                 n;        /* kernel argument; number of entries copied by trsm */
+    int                 h;        /* kernel argument; also stored in nextpiv->h by diag/offdiag */
+    int                 m0;       /* forwarded to the panel kernels only (not set for trsm) */
+    int                 ib;       /* kernel argument; row count of the trsm solve */
+    RUNTIME_sequence_t *sequence; /* options->sequence (not set for trsm) */
+    RUNTIME_request_t  *request;  /* options->request (not set for trsm) */
+};
+
 CHAMELEON_CL_CB( zgetrf_blocked_diag,    cti_handle_get_m(task->handles[0]), 0, 0, M )
 CHAMELEON_CL_CB( zgetrf_blocked_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M )
 CHAMELEON_CL_CB( zgetrf_blocked_trsm,    cti_handle_get_m(task->handles[0]), 0, 0, M )
@@ -29,9 +39,7 @@ CHAMELEON_CL_CB( zgetrf_blocked_trsm,    cti_handle_get_m(task->handles[0]), 0,
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                    m, n, h, m0, ib;
-    RUNTIME_sequence_t    *sequence;
-    RUNTIME_request_t     *request;
+    struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg;
     CHAM_tile_t           *tileA;
     CHAM_tile_t           *tileU;
     int                   *ipiv;
@@ -40,9 +48,6 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
     CHAMELEON_Complex64_t *U   = NULL;
     int                    ldu = -1;;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &ib,
-                                &sequence, &request );
-
     tileA   = cti_interface_get(descr[0]);
     ipiv    = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
     nextpiv = (cppi_interface_t*) descr[2];
@@ -53,10 +58,10 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
         ldu   = tileU->ld;
     }
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " );
     }
-    if ( h < tileA->n ) {
+    if ( clargs->h < tileA->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv before call: " );
     }
 
@@ -64,19 +69,19 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
      * Make sure the nextpiv interface store the right information about the
      * column and diagonal row for the reduction
      */
-    nextpiv->h        = h;
+    nextpiv->h        = clargs->h;
     nextpiv->has_diag = 1;
 
     coreblas_kernel_trace( tileA );
-    CORE_zgetrf_panel_diag( m, n, h, m0, ib,
+    CORE_zgetrf_panel_diag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib,
                             CHAM_tile_get_ptr( tileA ), tileA->ld,
                             U, ldu,
                             ipiv, &(nextpiv->pivot), &(prevpiv->pivot) );
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " );
     }
-    if ( h < tileA->n ) {
+    if ( clargs->h < tileA->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " );
     }
 }
@@ -87,21 +92,22 @@ static void cl_zgetrf_blocked_diag_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU( zgetrf_blocked_diag, cl_zgetrf_blocked_diag_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
                                       int m, int n, int h, int m0, int ib,
                                       CHAM_desc_t *A, int Am, int An,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ipiv )
 {
-    struct starpu_codelet *codelet = &cl_zgetrf_blocked_diag;
-    void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL;
-    const char *cl_name = "zgetrf_blocked_diag";
-    int rankA           = A->get_rankof(A, Am, An);
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_blocked_diag: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_diag_callback : NULL;
+    const char *cl_name = "zgetrf_blocked_diag";
+    int rankA           = A->get_rankof(A, Am, An);
 
 #if defined ( CHAMELEON_USE_MPI )
     if ( ( h % ib == 0 ) && ( h > 0 ) ) {
@@ -113,6 +119,17 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
     }
 #endif
 
+    /* Set codelet parameters */
+    struct cl_zgetrf_blocked_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->ib       = ib;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
     int access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
     int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
@@ -139,19 +156,18 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
                                       A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,             &m,                   sizeof(int),
-        STARPU_VALUE,             &n,                   sizeof(int),
-        STARPU_VALUE,             &h,                   sizeof(int),
-        STARPU_VALUE,             &m0,                  sizeof(int),
-        STARPU_VALUE,             &ib,                  sizeof(int),
-        STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
-        STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        &cl_zgetrf_blocked_diag,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s),
+
+        /* Task handles */
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+
+        /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -159,12 +175,97 @@ void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
         0 );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_blocked_diag( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int m0, int ib,
+                                      CHAM_desc_t *A, int Am, int An,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{
+    int ret, access_ipiv, access_npiv, access_ppiv, accessU;
+    struct starpu_task *task;
+    int rankA = A->get_rankof(A, Am, An);
+    /* Variant without starpu_insert_task: the zgetrf_blocked_diag task on tile A(Am, An) is created, filled and submitted by hand. */
+#if defined ( CHAMELEON_USE_MPI )
+    if ( ( h % ib == 0 ) && ( h > 0 ) ) { /* done on every rank, before the ownership test below */
+        starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un) );
+    }
+
+    if ( rankA != A->myrank ) { /* only the rank returned by get_rankof for A(Am, An) submits the task */
+        return;
+    }
+#endif
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_diag, zgetrf_blocked_diag, zgetrf_blocked, 5 ); /* NOTE(review): presumably declares cl, clargs, cl_name, params, nbdata and descrs used below (5 handles are registered) - confirm against the macro */
+
+    access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
+    access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+    accessU     = STARPU_RW;
+    if ( h == 0 ) {
+        accessU = STARPU_NONE;
+        /* U can be set after ppiv because they are both none together, so it won't shift the buffers */
+    }
+    else if ( h%ib == 0 ) {
+        accessU = STARPU_R;
+    }
+    else if ( ( h%ib == 1 ) || ( ib == 1 ) ) { /* NOTE(review): with ib == 1, h%ib is always 0, so the branch above already matched */
+        accessU = STARPU_W;
+    }
+
+    /*
+     * Register the data handles (A, ipiv, pivot h, pivot h-1, U), no exchange needed
+     */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ),     STARPU_RW );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An),               access_ipiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ),     accessU );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); /* NOTE(review): allocation result is not checked */
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->ib       = ib;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s );
+    task->cl_arg_free = 1; /* StarPU frees clargs when the task is destroyed */
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_diag_callback );
+
+    /* Flops */
+    task->flops = flops_zgetrf_blocked_diag( m, n, h, ib );
+
+    /* Refine name */
+    task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) { /* submission refused: destroy the task by hand and report */
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_blocked_diag", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                    m, n, h, k, m0, ib;
-    RUNTIME_sequence_t    *sequence;
-    RUNTIME_request_t     *request;
+    struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg;
     CHAM_tile_t           *tileA;
     CHAM_tile_t           *tileU;
     cppi_interface_t      *nextpiv;
@@ -172,8 +273,6 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
     CHAMELEON_Complex64_t *U   = NULL;
     int                    ldu = -1;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &k, &m0, &ib, &sequence, &request );
-
     tileA   = cti_interface_get(descr[0]);
     nextpiv = (cppi_interface_t*) descr[1];
     prevpiv = (cppi_interface_t*) descr[2];
@@ -183,26 +282,26 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
         ldu   = tileU->ld;
     }
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag before call: " );
     }
-    if ( h < tileA->n ) {
+    if ( clargs->h < tileA->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag before call: " );
     }
 
-    nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->h = clargs->h; /* Initialize in case it uses a copy */
     nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
     coreblas_kernel_trace( tileA );
-    CORE_zgetrf_panel_offdiag( m, n, h, m0, ib,
+    CORE_zgetrf_panel_offdiag( clargs->m, clargs->n, clargs->h, clargs->m0, clargs->ib,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
                                U, ldu,
                                &(nextpiv->pivot), &(prevpiv->pivot) );
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv offdiag after call: " );
     }
-    if ( h < tileA->n ) {
+    if ( clargs->h < tileA->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv offdiag after call: " );
     }
 }
@@ -213,23 +312,23 @@ static void cl_zgetrf_blocked_offdiag_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zgetrf_blocked_offdiag, cl_zgetrf_blocked_offdiag_cpu_func)
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
                                          int m, int n, int h, int m0, int ib,
                                          CHAM_desc_t *A, int Am, int An,
                                          CHAM_desc_t *U, int Um, int Un,
                                          CHAM_ipiv_t *ipiv )
 {
-    struct starpu_codelet *codelet = &cl_zgetrf_blocked_offdiag;
-
-    int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
-    int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
-    int accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
-    int rankA       = A->get_rankof(A, Am, An);
 #if !defined(HAVE_STARPU_NONE_NONZERO)
     /* STARPU_NONE can't be equal to 0 */
-    fprintf( stderr, "INSERT_TASK_zgetrf_percol_diag: STARPU_NONE can not be equal to 0\n" );
+    fprintf( stderr, "INSERT_TASK_zgetrf_blocked_offdiag: STARPU_NONE can not be equal to 0\n" );
     assert( 0 );
 #endif
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+    int accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+    int rankA       = A->get_rankof(A, Am, An);
 
 #if defined ( CHAMELEON_USE_MPI )
     if ( rankA != A->myrank ) {
@@ -244,6 +343,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
     }
 #endif
 
+    /* Set codelet parameters: heap copy handed to StarPU through STARPU_CL_ARGS below */
+    struct cl_zgetrf_blocked_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->ib       = ib;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_blocked_offdiag";
 
@@ -260,19 +370,17 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
                                       A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,             &m,                   sizeof(int),
-        STARPU_VALUE,             &n,                   sizeof(int),
-        STARPU_VALUE,             &h,                   sizeof(int),
-        STARPU_VALUE,             &An,                  sizeof(int),
-        STARPU_VALUE,             &m0,                  sizeof(int),
-        STARPU_VALUE,             &ib,                  sizeof(int),
-        STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
-        STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
+        &cl_zgetrf_blocked_offdiag,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s),
+
+        /* Task handles */
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
         accessU,                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+
+        /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -280,19 +388,96 @@ void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
         0 );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_blocked_offdiag( const RUNTIME_option_t *options,
+                                         int m, int n, int h, int m0, int ib,
+                                         CHAM_desc_t *A, int Am, int An,
+                                         CHAM_desc_t *U, int Um, int Un,
+                                         CHAM_ipiv_t *ipiv )
+{
+    int ret;
+    struct starpu_task *task;
+    int rankA       = A->get_rankof(A, Am, An);
+    int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    int access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+    int accessU     = ((h%ib == 0) && (h > 0)) ? STARPU_R : STARPU_NONE;
+    /* Variant without starpu_insert_task: the zgetrf_blocked_offdiag task on tile A(Am, An) is created, filled and submitted by hand. */
+#if defined ( CHAMELEON_USE_MPI )
+    if ( rankA != A->myrank ) { /* ranks that do not own A(Am, An) never submit the task */
+        if ( ( accessU != STARPU_NONE ) &&
+             ( A->myrank == A->get_rankof( A, An, An ) ) ) /* ...but the owner of A(An, An) still forwards U to rankA when U is read */
+        {
+            starpu_mpi_get_data_on_node_detached( options->sequence->comm,
+                                                  RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+                                                  rankA, NULL, NULL );
+        }
+        return;
+    }
+#endif
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_offdiag, zgetrf_blocked_offdiag, zgetrf_blocked, 4 ); /* NOTE(review): presumably declares cl, clargs, cl_name, params, nbdata and descrs used below (4 handles are registered) - confirm against the macro */
+
+    /*
+     * Register the data handles (A, pivot h, pivot h-1, then U), exchange needed only for U
+     */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ),     STARPU_RW );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RTBLKADDR( U, ChamComplexDouble, Um, Un ),
+                                                  accessU );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) ); /* NOTE(review): allocation result is not checked */
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->ib       = ib;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s );
+    task->cl_arg_free = 1; /* StarPU frees clargs when the task is destroyed */
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_offdiag_callback );
+
+    /* Flops */
+    task->flops = flops_zgetrf_blocked_offdiag( m, n, h, ib );
+
+    /* Refine name */
+    task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) { /* submission refused: destroy the task by hand and report */
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_blocked_offdiag", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #if !defined(CHAMELEON_SIMULATION)
 static const CHAMELEON_Complex64_t zone = (CHAMELEON_Complex64_t)1.0;
 
 static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
 {
-    int                    m, n, h, ib;
+    struct cl_zgetrf_blocked_args_s *clargs = (struct cl_zgetrf_blocked_args_s *)cl_arg;
     CHAM_tile_t           *tileU;
     cppi_interface_t      *prevpiv;
     CHAMELEON_Complex64_t *U;
     int                    ldu;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &ib );
-
     tileU   = cti_interface_get(descr[0]);
     prevpiv = (cppi_interface_t*) descr[1];
     U       = CHAM_tile_get_ptr( tileU );
@@ -301,16 +486,16 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
     coreblas_kernel_trace( tileU );
 
     /* Copy the final max line of the block and solve */
-    cblas_zcopy( n, prevpiv->pivot.pivrow, 1,
-                    U + m - 1, ldu );
+    cblas_zcopy( clargs->n, prevpiv->pivot.pivrow, 1,
+                            U +  clargs->m - 1, ldu );
 
-    if ( (n-h) > 0 ) {
+    if ( ( clargs->n - clargs->h ) > 0 ) {
         cblas_ztrsm( CblasColMajor,
                      CblasLeft, CblasLower,
                      CblasNoTrans, CblasUnit,
-                     ib, n - h,
-                     CBLAS_SADDR(zone), U + (h-ib) * ldu, ldu,
-                                        U +  h     * ldu, ldu );
+                     clargs->ib, clargs->n - clargs->h,
+                     CBLAS_SADDR(zone), U + (clargs->h-clargs->ib) * ldu, ldu,
+                                        U +  clargs->h             * ldu, ldu );
     }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -320,13 +505,13 @@ static void cl_zgetrf_blocked_trsm_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zgetrf_blocked_trsm, cl_zgetrf_blocked_trsm_cpu_func)
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
                                       int m, int n, int h, int ib,
                                       CHAM_desc_t *U, int Um, int Un,
                                       CHAM_ipiv_t *ipiv )
 {
-    struct starpu_codelet *codelet = &cl_zgetrf_blocked_trsm;
-
     void (*callback)(void*) = options->profiling ? cl_zgetrf_blocked_trsm_callback : NULL;
     const char *cl_name = "zgetrf_blocked_trsm";
     int rankU = U->get_rankof(U, Um, Un);
@@ -344,17 +529,86 @@ void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
         return;
     }
 
+    /* Set codelet parameters */
+    struct cl_zgetrf_blocked_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) );
+    clargs->m  = m;
+    clargs->n  = n;
+    clargs->h  = h;
+    clargs->ib = ib;
+
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,             &m,                   sizeof(int),
-        STARPU_VALUE,             &n,                   sizeof(int),
-        STARPU_VALUE,             &h,                   sizeof(int),
-        STARPU_VALUE,             &ib,                  sizeof(int),
+        &cl_zgetrf_blocked_trsm,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_blocked_args_s),
+
+        /* Task handles */
         STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
         STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ),
+
+        /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         STARPU_NAME,              cl_name,
         0 );
 }
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_blocked_trsm( const RUNTIME_option_t *options,
+                                      int m, int n, int h, int ib,
+                                      CHAM_desc_t *U, int Um, int Un,
+                                      CHAM_ipiv_t *ipiv )
+{
+    int ret;
+    struct starpu_task *task;
+    int rankU = U->get_rankof(U, Um, Un);
+    /* Variant without starpu_insert_task: the zgetrf_blocked_trsm task on tile U(Um, Un) is created, filled and submitted by hand. */
+    if ( U->myrank != rankU ) { /* only the rank owning U(Um, Un) submits the task */
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_blocked_trsm, zgetrf_blocked_trsm, zgetrf_blocked, 2 );
+
+    /*
+     * Register the data handles (U, then pivot h-1), no exchange needed
+     */
+    starpu_cham_exchange_init_params( options, &params, rankU );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ),     STARPU_RW );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankU, Un, h-1 ), STARPU_R  );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters: the trsm codelet reads only m, n, h and ib, the other fields stay unset */
+    clargs = malloc( sizeof( struct cl_zgetrf_blocked_args_s ) );
+    clargs->m  = m;
+    clargs->n  = n;
+    clargs->h  = h;
+    clargs->ib = ib;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_blocked_args_s );
+    task->cl_arg_free = 1; /* StarPU frees clargs when the task is destroyed */
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_blocked_trsm_callback );
+
+    /* Flops */
+    task->flops = flops_zgetrf_trsm( m, n, h, ib );
+
+    /* Refine name */
+    task->name = chameleon_codelet_name( cl_name, 1, U->get_blktile( U, Um, Un ) );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) { /* submission refused: destroy the task by hand and report under this function's name */
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_blocked_trsm", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
index 28f93240c795ddf781910bcc70dbc5fdd531dc22..bedb371858a13eec4cd30511310469ae037db9a4 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_nopiv.c
@@ -26,28 +26,31 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zgetrf_nopiv_args_s { /* cl_arg payload of the zgetrf_nopiv codelet */
+    int                 m;        /* passed to TCORE_zgetrf_nopiv */
+    int                 n;        /* passed to TCORE_zgetrf_nopiv */
+    int                 ib;       /* passed to TCORE_zgetrf_nopiv */
+    int                 iinfo;    /* added to the kernel's info when a failure is reported */
+    RUNTIME_sequence_t *sequence; /* status is checked; flushed when the kernel returns info != 0 */
+    RUNTIME_request_t  *request;  /* handed to RUNTIME_sequence_flush together with sequence */
+};
+
 /*
  * Codelet CPU
  */
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
 {
-    int m;
-    int n;
-    int ib;
+    struct cl_zgetrf_nopiv_args_s *clargs = (struct cl_zgetrf_nopiv_args_s *)cl_arg;
     CHAM_tile_t *tileA;
-    int iinfo;
-    RUNTIME_sequence_t *sequence;
-    RUNTIME_request_t *request;
     int info = 0;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args(cl_arg, &m, &n, &ib, &iinfo, &sequence, &request);
-    TCORE_zgetrf_nopiv(m, n, ib, tileA, &info);
+    TCORE_zgetrf_nopiv( clargs->m, clargs->n, clargs->ib, tileA, &info );
 
-    if ( (sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
-        RUNTIME_sequence_flush( NULL, sequence, request, iinfo+info );
+    if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
+        RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info );
     }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -57,30 +60,121 @@ static void cl_zgetrf_nopiv_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zgetrf_nopiv, cl_zgetrf_nopiv_cpu_func)
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options,
                               int m, int n, int ib, int nb,
                               const CHAM_desc_t *A, int Am, int An,
                               int iinfo)
 {
-    (void)nb;
-    struct starpu_codelet *codelet = &cl_zgetrf_nopiv;
-    void (*callback)(void*) = options->profiling ? cl_zgetrf_nopiv_callback : NULL;
+    void (*callback)(void*);
+    struct cl_zgetrf_nopiv_args_s *clargs  = NULL;
+    int                            exec    = 0;
+    const char                    *cl_name = "zgetrf_nopiv";
 
+    /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Set codelet parameters */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zgetrf_nopiv_args_s ) );
+        clargs->m        = m;
+        clargs->n        = n;
+        clargs->ib       = ib;
+        clargs->iinfo    = iinfo;
+        clargs->sequence = options->sequence;
+        clargs->request  = options->request;
+    }
+
+    /* Callback for profiling information */
+    callback = options->profiling ? cl_zgetrf_nopiv_callback : NULL;
+
+    /* Refine name */
+    cl_name = chameleon_codelet_name( cl_name, 1,
+                                      A->get_blktile( A, Am, An ) );
+
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &m,                         sizeof(int),
-        STARPU_VALUE,    &n,                         sizeof(int),
-        STARPU_VALUE,    &ib,                        sizeof(int),
-        STARPU_RW,        RTBLKADDR(A, ChamComplexDouble, Am, An),
-        STARPU_VALUE,    &iinfo,                     sizeof(int),
-        STARPU_VALUE,    &(options->sequence),       sizeof(RUNTIME_sequence_t*),
-        STARPU_VALUE,    &(options->request),        sizeof(RUNTIME_request_t*),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
+        &cl_zgetrf_nopiv,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_zgetrf_nopiv_args_s),
+        STARPU_RW,                RTBLKADDR(A, ChamComplexDouble, Am, An),
+
+        /* Common task arguments */
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_NAME,              cl_name,
         0 );
+
+    (void)nb;
 }
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_nopiv(const RUNTIME_option_t *options,
+                              int m, int n, int ib, int nb,
+                              const CHAM_desc_t *A, int Am, int An,
+                              int iinfo)
+{
+    INSERT_TASK_COMMON_PARAMETERS( zgetrf_nopiv, 1 );
+
+    /*
+     * Register the data handles and initialize exchanges if needed
+     */
+    starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW );
+
+    /*
+     * Not involved, let's return
+     */
+    if ( nbdata == 0 ) {
+        return;
+    }
+
+    if ( params.do_execute )
+    {
+        int ret;
+        struct starpu_task *task = starpu_task_create();
+        task->cl = cl;
+
+        /* Set codelet parameters */
+        clargs = malloc( sizeof( struct cl_zgetrf_nopiv_args_s ) );
+        clargs->m        = m;
+        clargs->n        = n;
+        clargs->ib       = ib;
+        clargs->iinfo    = iinfo;
+        clargs->sequence = options->sequence;
+        clargs->request  = options->request;
+
+        task->cl_arg      = clargs;
+        task->cl_arg_size = sizeof( struct cl_zgetrf_nopiv_args_s );
+        task->cl_arg_free = 1;
+
+        /* Set common parameters */
+        starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_nopiv_callback );
+
+        /* Flops */
+        task->flops = flops_zgetrf( m, n );
+
+        /* Refine name */
+        task->name = chameleon_codelet_name( cl_name, 1,
+                                             A->get_blktile( A, Am, An ) );
+
+        ret = starpu_task_submit( task );
+        if ( ret == -ENODEV ) {
+            task->destroy = 0;
+            starpu_task_destroy( task );
+            chameleon_error( "INSERT_TASK_zgetrf_nopiv", "Failed to submit the task to StarPU" );
+            return;
+        }
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+
+    (void)nb;
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zgetrf_percol.c b/runtime/starpu/codelets/codelet_zgetrf_percol.c
index 1ee38e8a12f0bb095ca65636b7569f1ee98c20af..9a0ec048b78b68569974267edb5c62aa97ce65d2 100644
--- a/runtime/starpu/codelets/codelet_zgetrf_percol.c
+++ b/runtime/starpu/codelets/codelet_zgetrf_percol.c
@@ -22,31 +22,36 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zgetrf_percol_args_s { /* Arguments shared by the percol diag and offdiag codelets */
+    int                 m;        /* Forwarded as 1st argument to CORE_zgetrf_panel_{diag,offdiag} */
+    int                 n;        /* Forwarded as 2nd argument to CORE_zgetrf_panel_{diag,offdiag} */
+    int                 h;        /* Forwarded as 3rd argument, and stored in nextpiv->h */
+    int                 m0;       /* Forwarded as 4th argument to CORE_zgetrf_panel_{diag,offdiag} */
+    RUNTIME_sequence_t *sequence; /* Copied from options->sequence at task insertion */
+    RUNTIME_request_t  *request;  /* Copied from options->request at task insertion */
+};
+
 CHAMELEON_CL_CB( zgetrf_percol_diag,    cti_handle_get_m(task->handles[0]), 0, 0, M )
 CHAMELEON_CL_CB( zgetrf_percol_offdiag, cti_handle_get_m(task->handles[0]), 0, 0, M )
 
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                 m, n, h, m0;
-    RUNTIME_sequence_t *sequence;
-    RUNTIME_request_t  *request;
+    struct cl_zgetrf_percol_args_s *clargs = (struct cl_zgetrf_percol_args_s *)cl_arg;
     CHAM_tile_t        *tileA;
     int                *ipiv;
     cppi_interface_t   *nextpiv;
     cppi_interface_t   *prevpiv;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request );
-
     tileA   = cti_interface_get(descr[0]);
     ipiv    = (int *)STARPU_VECTOR_GET_PTR(descr[1]);
     nextpiv = (cppi_interface_t*) descr[2];
     prevpiv = (cppi_interface_t*) descr[3];
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv before call: " );
     }
-    if ( h < tileA->n ) {
+    if ( clargs->h < tileA->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv before call: " );
     }
 
@@ -54,18 +59,18 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg)
      * Make sure the nextpiv interface store the right information about the
      * column and diagonal row for the reduction
      */
-    nextpiv->h        = h;
+    nextpiv->h        = clargs->h;
     nextpiv->has_diag = 1;
 
-    CORE_zgetrf_panel_diag( m, n, h, m0, tileA->n,
+    CORE_zgetrf_panel_diag( clargs->m, clargs->n, clargs->h, clargs->m0, tileA->n,
                             CHAM_tile_get_ptr( tileA ), tileA->ld,
                             NULL, -1,
                             ipiv, &(nextpiv->pivot), &(prevpiv->pivot) );
 
-    if ( h > 0 ) {
+    if ( clargs->h > 0 ) {
         cppi_display_dbg( prevpiv, stderr, "Prevpiv after call: " );
     }
-    if ( h < n ) {
+    if ( clargs->h < clargs->n ) {
         cppi_display_dbg( nextpiv, stderr, "Nextpiv after call: " );
     }
 }
@@ -76,12 +81,13 @@ static void cl_zgetrf_percol_diag_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU( zgetrf_percol_diag, cl_zgetrf_percol_diag_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
                                      int m, int n, int h, int m0,
                                      CHAM_desc_t *A, int Am, int An,
                                      CHAM_ipiv_t *ipiv )
 {
-    struct starpu_codelet *codelet = &cl_zgetrf_percol_diag;
     void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_diag_callback : NULL;
     const char *cl_name = "zgetrf_percol_diag";
     int rankA           = A->get_rankof(A, Am, An);
@@ -105,21 +111,31 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
     CHAMELEON_ACCESS_RW( A, Am, An );
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Set codelet parameters */
+    struct cl_zgetrf_percol_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
     /* Refine name */
     cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,             &m,                   sizeof(int),
-        STARPU_VALUE,             &n,                   sizeof(int),
-        STARPU_VALUE,             &h,                   sizeof(int),
-        STARPU_VALUE,             &m0,                  sizeof(int),
-        STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
-        STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        &cl_zgetrf_percol_diag,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_percol_args_s),
+
+        /* Task handles */
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         access_ipiv,              RUNTIME_ipiv_getaddr( ipiv, An ),
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+
+        /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
@@ -127,26 +143,89 @@ void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
         0 );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_percol_diag( const RUNTIME_option_t *options,
+                                     int m, int n, int h, int m0,
+                                     CHAM_desc_t *A, int Am, int An,
+                                     CHAM_ipiv_t *ipiv )
+{
+    int ret, access_ipiv, access_npiv, access_ppiv;
+    struct starpu_task *task;
+    int rankA = A->get_rankof(A, Am, An);
+
+    if ( rankA != A->myrank ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_percol_diag, zgetrf_percol_diag, zgetrf_percol, 4 );
+
+    access_ipiv = ( h == 0 )       ? STARPU_W    : STARPU_RW;
+    access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+
+    /*
+     * Register the data handles, no exchange needed
+     */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ),     STARPU_RW );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_ipiv_getaddr( ipiv, An),               access_ipiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_percol_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_diag_callback );
+
+    /* Flops */
+    task->flops = flops_zgetrf_percol_diag( m, n, h );
+
+    /* Refine name */
+    task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_percol_diag", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg)
 {
-    int                 m, n, h, m0;
-    RUNTIME_sequence_t *sequence;
-    RUNTIME_request_t  *request;
+    struct cl_zgetrf_percol_args_s *clargs = (struct cl_zgetrf_percol_args_s *)cl_arg;
     CHAM_tile_t        *tileA;
     cppi_interface_t   *nextpiv;
     cppi_interface_t   *prevpiv;
 
-    starpu_codelet_unpack_args( cl_arg, &m, &n, &h, &m0, &sequence, &request );
-
     tileA   = cti_interface_get(descr[0]);
     nextpiv = (cppi_interface_t*) descr[1];
     prevpiv = (cppi_interface_t*) descr[2];
 
-    nextpiv->h = h; /* Initialize in case it uses a copy */
+    nextpiv->h = clargs->h; /* Initialize in case it uses a copy */
     nextpiv->has_diag = chameleon_max( -1, nextpiv->has_diag);
 
-    CORE_zgetrf_panel_offdiag( m, n, h, m0, tileA->n,
+    CORE_zgetrf_panel_offdiag( clargs->m, clargs->n, clargs->h, clargs->m0, tileA->n,
                                CHAM_tile_get_ptr(tileA), tileA->ld,
                                NULL, -1,
                                &(nextpiv->pivot), &(prevpiv->pivot) );
@@ -158,13 +237,13 @@ static void cl_zgetrf_percol_offdiag_cpu_func(void *descr[], void *cl_arg)
  */
 CODELETS_CPU(zgetrf_percol_offdiag, cl_zgetrf_percol_offdiag_cpu_func)
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
                                         int m, int n, int h, int m0,
                                         CHAM_desc_t *A, int Am, int An,
                                         CHAM_ipiv_t *ipiv )
 {
-    struct starpu_codelet *codelet = &cl_zgetrf_percol_offdiag;
-
     void (*callback)(void*) = options->profiling ? cl_zgetrf_percol_offdiag_callback : NULL;
     const char *cl_name = "zgetrf_percol_offdiag";
     int access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
@@ -185,23 +264,98 @@ void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
     CHAMELEON_ACCESS_RW( A, Am, An );
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Set codelet parameters */
+    struct cl_zgetrf_percol_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+
     /* Refine name */
     cl_name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
 
     rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,             &m,                   sizeof(int),
-        STARPU_VALUE,             &n,                   sizeof(int),
-        STARPU_VALUE,             &h,                   sizeof(int),
-        STARPU_VALUE,             &m0,                  sizeof(int),
-        STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t *),
-        STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t *),
+        &cl_zgetrf_percol_offdiag,
+        /* Task codelet arguments */
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgetrf_percol_args_s),
+
+        /* Task handles */
         STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         access_npiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h   ),
         access_ppiv,              RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ),
+
+        /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         STARPU_NAME,              cl_name,
         0 );
 }
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zgetrf_percol_offdiag( const RUNTIME_option_t *options,
+                                        int m, int n, int h, int m0,
+                                        CHAM_desc_t *A, int Am, int An,
+                                        CHAM_ipiv_t *ipiv )
+{
+    int ret, access_npiv, access_ppiv;
+    struct starpu_task *task;
+    int rankA = A->get_rankof(A, Am, An);
+    /* Only the rank returned by get_rankof for tile A(Am, An) submits the task */
+    if ( rankA != A->myrank ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zgetrf_percol_offdiag, zgetrf_percol_offdiag, zgetrf_percol, 3 );
+
+    access_npiv = ( h == ipiv->n ) ? STARPU_R    : STARPU_REDUX;
+    access_ppiv = ( h == 0 )       ? STARPU_NONE : STARPU_R;
+
+    /*
+     * Register the three handles (tile, pivot h, pivot h-1), no exchange needed
+     */
+    starpu_cham_exchange_init_params( options, &params, rankA );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ),     STARPU_RW );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h ),   access_npiv );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_pivot_getaddr( ipiv, rankA, An, h-1 ), access_ppiv );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs = malloc( sizeof( struct cl_zgetrf_percol_args_s ) );
+    clargs->m        = m;
+    clargs->n        = n;
+    clargs->h        = h;
+    clargs->m0       = m0;
+    clargs->sequence = options->sequence;
+    clargs->request  = options->request;
+    /* cl_arg_free = 1 hands ownership of clargs to StarPU, which frees it with the task */
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zgetrf_percol_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, cl_zgetrf_percol_offdiag_callback );
+
+    /* Flops */
+    task->flops = flops_zgetrf_percol_offdiag( m, n, h );
+
+    /* Refine name */
+    task->name = chameleon_codelet_name( cl_name, 1, A->get_blktile( A, Am, An ) );
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zgetrf_percol_offdiag", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c
index 2ce1c3a5a5fe2501683f53856e632b5bd974c098..6584684a40f43cb40417458dc69d4591aa8730f4 100644
--- a/runtime/starpu/codelets/codelet_zhemm.c
+++ b/runtime/starpu/codelets/codelet_zhemm.c
@@ -126,6 +126,7 @@ CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
                               int m, int n, int nb,
@@ -274,7 +275,7 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
         0 );
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void __INSERT_TASK_zhemm( const RUNTIME_option_t *options,
                           cham_side_t side, cham_uplo_t uplo,
@@ -295,9 +296,9 @@ void __INSERT_TASK_zhemm( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, xrank );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -391,4 +392,5 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options,
                                 B, Bm, Bn,
                          beta,  C, Cm, Cn );
 }
-#endif
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c
index 04c5354996e62e3f70b2ed908124ae0a88eeec69..455427bc90ad26d5a52a322d0617256e80964824 100644
--- a/runtime/starpu/codelets/codelet_zher2k.c
+++ b/runtime/starpu/codelets/codelet_zher2k.c
@@ -108,6 +108,7 @@ CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
                          int n, int k, int nb,
@@ -175,7 +176,7 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
@@ -200,9 +201,9 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -256,4 +257,4 @@ void INSERT_TASK_zher2k( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 7ea5448cad03886644367b3b641f2ffaa9b665e8..beba72bf4cb5cfd5c6feda024c71fd9dbed7db58 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -110,6 +110,7 @@ CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
@@ -175,7 +176,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
@@ -199,8 +200,8 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -253,4 +254,4 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zipiv_allreduce.c b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
index a3d4d21441da1c09f9eb2f6b6983deb95550538c..e79a1841d491524d67ffd3c398bba8efe5706690 100644
--- a/runtime/starpu/codelets/codelet_zipiv_allreduce.c
+++ b/runtime/starpu/codelets/codelet_zipiv_allreduce.c
@@ -20,7 +20,7 @@
 #include "runtime_codelet_z.h"
 
 #if defined(CHAMELEON_USE_MPI)
-struct cl_redux_args_t {
+struct cl_redux_args_s {
     int h;
     int n;
 };
@@ -55,7 +55,11 @@ zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me,
          ( cppi_me->has_diag  == -1 ) )
     {
         cblas_zcopy( n, nextpiv_src->diagrow, 1, nextpiv_me->diagrow, 1 );
-        assert( cppi_src->arraysize == sizeof(CHAMELEON_Complex64_t) * n );
+        /*
+         * The interface of the pivot is registered once in starpu so
+         * the arraysize is not always correct
+         */
+        assert( cppi_src->arraysize >= sizeof(CHAMELEON_Complex64_t) * n );
         cppi_me->has_diag = 1;
     }
 
@@ -65,7 +69,7 @@ zipiv_allreduce_cpu_func( cppi_interface_t *cppi_me,
 static void
 cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
 {
-    struct cl_redux_args_t *clargs   = (struct cl_redux_args_t *) cl_arg;
+    struct cl_redux_args_s *clargs   = (struct cl_redux_args_s *) cl_arg;
     cppi_interface_t       *cppi_me  = ((cppi_interface_t *) descr[0]);
     cppi_interface_t       *cppi_src = ((cppi_interface_t *) descr[1]);
     zipiv_allreduce_cpu_func(  cppi_me, cppi_src, clargs->h, clargs->n );
@@ -73,6 +77,8 @@ cl_zipiv_allreduce_cpu_func( void *descr[], void *cl_arg )
 
 CODELETS_CPU( zipiv_allreduce, cl_zipiv_allreduce_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 static void
 INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options,
                                   CHAM_ipiv_t            *ipiv,
@@ -98,14 +104,14 @@ INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options,
                                   int                     h,
                                   int                     n )
 {
-    struct cl_redux_args_t *clargs;
-    clargs    = malloc( sizeof( struct cl_redux_args_t ) );
+    struct cl_redux_args_s *clargs;
+    clargs    = malloc( sizeof( struct cl_redux_args_s ) );
     clargs->h = h;
     clargs->n = n;
 
     rt_starpu_insert_task(
         &cl_zipiv_allreduce,
-        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_t),
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_s),
         STARPU_RW,                RUNTIME_pivot_getaddr( ipiv, me,  k, h ),
         STARPU_R,                 RUNTIME_pivot_getaddr( ipiv, src, k, h ),
         STARPU_EXECUTE_ON_NODE,   me,
@@ -115,6 +121,83 @@ INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options,
     starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+static void
+INSERT_TASK_zipiv_allreduce_send( const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     me,
+                                  int                     dst,
+                                  int                     k,
+                                  int                     h )
+{
+    INSERT_TASK_COMMON_PARAMETERS_CLNULL( zipiv_allreduce_send, 1 )
+
+    starpu_cham_exchange_init_params( options, &params, dst );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_pivot_getaddr( ipiv, me, k, h ),
+                                                  STARPU_R );
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    (void)cl;
+    (void)cl_name;
+}
+
+static void
+INSERT_TASK_zipiv_allreduce_recv( const RUNTIME_option_t *options,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     me,
+                                  int                     src,
+                                  int                     k,
+                                  int                     h,
+                                  int                     n )
+{
+    int ret;
+    struct starpu_task *task;
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zipiv_allreduce_recv, zipiv_allreduce, redux, 2 )
+
+    starpu_cham_exchange_init_params( options, &params, me );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_pivot_getaddr( ipiv, me,  k, h ),
+                                                  STARPU_RW );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_pivot_getaddr( ipiv, src, k, h ),
+                                                  STARPU_R );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs    = malloc( sizeof( struct cl_redux_args_s ) );
+    clargs->h = h;
+    clargs->n = n;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_redux_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = 0.;
+
+    /* Refine name */
+    task->name = cl_name;
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zipiv_allreduce", "Failed to submit the task to StarPU" );
+        return;
+    }
+
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    starpu_mpi_cache_flush( options->sequence->comm, RUNTIME_pivot_getaddr( ipiv, src, k, h ) );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 static void
 zipiv_allreduce_chameleon_starpu_task( const RUNTIME_option_t *options,
                                        CHAM_desc_t            *A,
diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c
index 7478942fe3fe48b1bceeb0e9474243ac016cce8f..b9ddb9a8c83659dea3a4bddc6abea31180a57cf6 100644
--- a/runtime/starpu/codelets/codelet_zlascal.c
+++ b/runtime/starpu/codelets/codelet_zlascal.c
@@ -50,6 +50,7 @@ cl_zlascal_cpu_func( void *descr[], void *cl_arg )
 CODELETS_CPU( zlascal, cl_zlascal_cpu_func )
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
                           cham_uplo_t uplo,
                           int m, int n, int nb,
@@ -104,7 +105,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
                           cham_uplo_t uplo,
@@ -127,7 +128,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW );
 
     /*
      * Not involved, let's return
@@ -177,4 +178,4 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
index 2c77f07f3d3e41d0df3402aff447d0ca55e06f2d..81c28d92f05d6c23e85e743b8402b79db31815b1 100644
--- a/runtime/starpu/codelets/codelet_zlaswp.c
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -20,19 +20,23 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
+struct cl_zlaswp_args_s {
+    int m0; /* Forwarded as the first argument of TCORE_zlaswp_get/TCORE_zlaswp_set */
+    int k;  /* Forwarded as the fourth argument of TCORE_zlaswp_get/TCORE_zlaswp_set */
+};
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg )
 {
-    int          m0, k, *perm;
+    struct cl_zlaswp_args_s *clargs = (struct cl_zlaswp_args_s *)cl_arg;
+    int         *perm;
     CHAM_tile_t *A, *B;
 
-    starpu_codelet_unpack_args( cl_arg, &m0, &k );
-
     perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
     A    = (CHAM_tile_t *) cti_interface_get( descr[1] );
     B    = (CHAM_tile_t *) cti_interface_get( descr[2] );
 
-    TCORE_zlaswp_get( m0, A->m, A->n, k, A, B, perm );
+    TCORE_zlaswp_get( clargs->m0, A->m, A->n, clargs->k, A, B, perm );
 }
 #endif
 
@@ -41,6 +45,8 @@ static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg )
  */
 CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
                              int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
@@ -52,12 +58,16 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
         return;
     }
 
+    struct cl_zlaswp_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zlaswp_args_s ) );
+    clargs->m0 = m0;
+    clargs->k  = k;
+
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL;
 
     rt_starpu_insert_task(
         codelet,
-        STARPU_VALUE,               &m0, sizeof(int),
-        STARPU_VALUE,               &k,  sizeof(int),
+        STARPU_CL_ARGS,             clargs, sizeof(struct cl_zlaswp_args_s),
         STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
         STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, Am, An),
         STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
@@ -67,19 +77,77 @@ void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
         0 );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    int ret;
+    struct starpu_task *task;
+    /* Nothing is submitted when get_rankof() of the A tile differs from A->myrank */
+    if ( A->get_rankof( A, Am, An) != A->myrank ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_get, zlaswp_get, zlaswp, 3);
+
+    /*
+     * Register the data handles: perm goes through the exchange layer (it might need to be received), A and U are registered directly
+     */
+    starpu_cham_exchange_init_params( options, &params, U->get_rankof( U, Um, Un ) );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_perm_getaddr( ipiv, ipivk ),
+                                                  STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ),
+                                STARPU_RW | STARPU_COMMUTE );
+
+    task = starpu_task_create();
+    task->cl = cl;
+    /* Set codelet parameters (m0 and k are the only values read by cl_zlaswp_get_cpu_func) */
+    clargs = malloc( sizeof( struct cl_zlaswp_args_s ) );
+    clargs->m0 = m0;
+    clargs->k  = k;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zlaswp_args_s );
+    task->cl_arg_free = 1;
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = 0.;
+
+    /* Refine name */
+    task->name = cl_name;
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zlaswp_get", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg )
 {
-    int          m0, k, *invp;
+    struct cl_zlaswp_args_s *clargs = (struct cl_zlaswp_args_s *)cl_arg;
+    int         *invp;
     CHAM_tile_t *A, *B;
 
-    starpu_codelet_unpack_args( cl_arg, &m0, &k );
-
     invp = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
     A    = (CHAM_tile_t *) cti_interface_get( descr[1] );
     B    = (CHAM_tile_t *) cti_interface_get( descr[2] );
 
-    TCORE_zlaswp_set( m0, B->m, B->n, k, A, B, invp );
+    TCORE_zlaswp_set( clargs->m0, B->m, B->n, clargs->k, A, B, invp );
 }
 #endif
 
@@ -88,6 +156,7 @@ static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg )
  */
 CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
 void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
                              int m0, int k,
                              const CHAM_ipiv_t *ipiv, int ipivk,
@@ -95,16 +164,20 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
                              const CHAM_desc_t *B, int Bm, int Bn )
 {
     struct starpu_codelet *codelet = &cl_zlaswp_set;
-    if ( A->get_rankof( B, Bm, Bn) != A->myrank ) {
+    if ( B->get_rankof( B, Bm, Bn) != A->myrank ) {
         return;
     }
 
+    struct cl_zlaswp_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_zlaswp_args_s ) );
+    clargs->m0 = m0;
+    clargs->k  = k;
+
     //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL;
 
     rt_starpu_insert_task(
         codelet,
-        STARPU_VALUE,             &m0, sizeof(int),
-        STARPU_VALUE,             &k,  sizeof(int),
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_zlaswp_args_s),
         STARPU_R,                 RUNTIME_invp_getaddr( ipiv, ipivk ),
         STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
         STARPU_RW,                RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
@@ -113,3 +186,60 @@ void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
         0 );
 }
+#else
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    int ret;
+    struct starpu_task *task;
+    /* Nothing is submitted when get_rankof() of the B tile differs from A->myrank */
+    if ( B->get_rankof( B, Bm, Bn) != A->myrank ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zlaswp_set, zlaswp_set, zlaswp, 3);
+
+    /*
+     * Register the data handles: invp goes through the exchange layer (it might need to be received), A and B are registered directly
+     */
+    starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_invp_getaddr( ipiv, ipivk ),
+                                                  STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( B, ChamComplexDouble, Bm, Bn ), STARPU_RW );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters (m0 and k are the only values read by cl_zlaswp_set_cpu_func) */
+    clargs = malloc( sizeof( struct cl_zlaswp_args_s ) );
+    clargs->m0 = m0;
+    clargs->k  = k;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_zlaswp_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = 0.;
+
+    /* Refine name */
+    task->name = cl_name;
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zlaswp_set", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+}
+#endif
diff --git a/runtime/starpu/codelets/codelet_zlaswp_batched.c b/runtime/starpu/codelets/codelet_zlaswp_batched.c
index f43a68947a9798d15377bb460a54998f208da898..8cc2a3adc593c698f3d79163781f44bd59b92d6e 100644
--- a/runtime/starpu/codelets/codelet_zlaswp_batched.c
+++ b/runtime/starpu/codelets/codelet_zlaswp_batched.c
@@ -18,7 +18,7 @@
 #include "chameleon_starpu_internal.h"
 #include "runtime_codelet_z.h"
 
-struct cl_laswp_batched_args_t {
+struct cl_zlaswp_batched_args_s {
     int                      tasks_nbr;
     int                      minmn;
     int                      m0[CHAMELEON_BATCH_SIZE];
@@ -32,7 +32,7 @@ cl_zlaswp_batched_cpu_func( void *descr[],
 {
     int          i, m0, minmn, *perm, *invp;
     CHAM_tile_t *A, *U, *B;
-    struct cl_laswp_batched_args_t *clargs = ( struct cl_laswp_batched_args_t * ) cl_arg;
+    struct cl_zlaswp_batched_args_s *clargs = ( struct cl_zlaswp_batched_args_s * ) cl_arg;
 
     minmn = clargs->minmn;
     perm = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
@@ -73,14 +73,13 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
 {
     int task_num   = 0;
     int batch_size = ((struct chameleon_pzgetrf_s *)ws)->batch_size_swap;
-    int nhandles;
-    struct cl_laswp_batched_args_t *clargs = *clargs_ptr;
+    struct cl_zlaswp_batched_args_s *clargs = *clargs_ptr;
     if ( Am->get_rankof( Am, Amm, Amn) != Am->myrank ) {
         return;
     }
 
     if( clargs == NULL ) {
-        clargs = malloc( sizeof( struct cl_laswp_batched_args_t ) ) ;
+        clargs = malloc( sizeof( struct cl_zlaswp_batched_args_s ) ) ;
         clargs->tasks_nbr = 0;
         clargs->minmn     = minmn;
         *clargs_ptr       = clargs;
@@ -93,24 +92,12 @@ void INSERT_TASK_zlaswp_batched( const RUNTIME_option_t *options,
     clargs->tasks_nbr ++;
 
     if ( clargs->tasks_nbr == batch_size ) {
-        nhandles = clargs->tasks_nbr;
-        rt_starpu_insert_task(
-            &cl_zlaswp_batched,
-            STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
-            STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
-            STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
-            STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
-            STARPU_R,                   RTBLKADDR(Ak, ChamComplexDouble, Akm, Akn),
-            STARPU_DATA_MODE_ARRAY,     clargs->handle_mode, nhandles,
-            STARPU_PRIORITY,            options->priority,
-            STARPU_EXECUTE_ON_WORKER,   options->workerid,
-            0 );
-
-        /* clargs is freed by starpu. */
-        *clargs_ptr = NULL;
+        INSERT_TASK_zlaswp_batched_flush( options, ipiv, ipivk, Ak, Akm, Akn, U, Um, Un, clargs_ptr );
     }
 }
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
                                        const CHAM_ipiv_t      *ipiv,
                                        int                     ipivk,
@@ -122,7 +109,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
                                        int                     Un,
                                        void                  **clargs_ptr )
 {
-    struct cl_laswp_batched_args_t *clargs   = *clargs_ptr;
+    struct cl_zlaswp_batched_args_s *clargs   = *clargs_ptr;
     int                             nhandles;
 
     if( clargs == NULL ) {
@@ -132,7 +119,7 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
     nhandles = clargs->tasks_nbr;
     rt_starpu_insert_task(
         &cl_zlaswp_batched,
-        STARPU_CL_ARGS,             clargs, sizeof(struct cl_laswp_batched_args_t),
+        STARPU_CL_ARGS,             clargs, sizeof(struct cl_zlaswp_batched_args_s),
         STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
         STARPU_R,                   RUNTIME_invp_getaddr( ipiv, ipivk ),
         STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un),
@@ -145,3 +132,74 @@ void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
     /* clargs is freed by starpu. */
     *clargs_ptr = NULL;
 }
+
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+void INSERT_TASK_zlaswp_batched_flush( const RUNTIME_option_t *options,
+                                       const CHAM_ipiv_t      *ipiv,
+                                       int                     ipivk,
+                                       const CHAM_desc_t      *Ak,
+                                       int                     Akm,
+                                       int                     Akn,
+                                       const CHAM_desc_t      *U,
+                                       int                     Um,
+                                       int                     Un,
+                                       void                  **clargs_ptr )
+{
+    int ret, k;
+    struct starpu_task *task;
+    struct cl_zlaswp_batched_args_s *myclargs = *clargs_ptr;
+    /* Submit the pending batch stored in *clargs_ptr as one task; no-op when there is no pending batch */
+    if( myclargs == NULL ) {
+        return;
+    }
+
+    INSERT_TASK_COMMON_PARAMETERS( zlaswp_batched, myclargs->tasks_nbr + 4 );
+
+    /*
+     * Register the data handles: perm and invp go through the exchange layer, then U, Ak and the tasks_nbr batched handles
+     */
+    starpu_cham_exchange_init_params( options, &params, Ak->myrank );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_perm_getaddr( ipiv, ipivk ),
+                                                  STARPU_R );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RUNTIME_invp_getaddr( ipiv, ipivk ),
+                                                  STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( U, ChamComplexDouble, Um, Un ),
+                                STARPU_RW | STARPU_COMMUTE );
+    starpu_cham_register_descr( &nbdata, descrs, RTBLKADDR( Ak, ChamComplexDouble, Akm, Akn ), STARPU_R );
+    for ( k = 0; k < myclargs->tasks_nbr; k++ ) {
+        starpu_cham_register_descr( &nbdata, descrs, myclargs->handle_mode[ k ].handle, STARPU_RW );
+    }
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    task->cl_arg      = myclargs;
+    task->cl_arg_size = sizeof( struct cl_zlaswp_batched_args_s );
+    task->cl_arg_free = 1;
+
+    /* The task now owns the batch (freed by StarPU, also when the submission fails): drop the caller's pointer */
+    *clargs_ptr = NULL;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = 0.;
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zlaswp_batched", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    (void)clargs;
+    (void)cl_name;
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zperm_allreduce.c b/runtime/starpu/codelets/codelet_zperm_allreduce.c
index 93bd984ab215224b751a325f775ec85e422df1a1..e32b7ad9c46a2303eb1c4c6a18d442935fca6d3a 100644
--- a/runtime/starpu/codelets/codelet_zperm_allreduce.c
+++ b/runtime/starpu/codelets/codelet_zperm_allreduce.c
@@ -21,8 +21,9 @@
 #include <coreblas/cblas_wrapper.h>
 
 #if defined(CHAMELEON_USE_MPI)
-struct cl_redux_args_t {
+struct cl_redux_args_s {
     int tempmm;
+    int mb;
     int n;
     int p;
     int q;
@@ -35,7 +36,7 @@ struct cl_redux_args_t {
 static void
 cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg )
 {
-    struct cl_redux_args_t      *clargs     = (struct cl_redux_args_t *) cl_arg;
+    struct cl_redux_args_s      *clargs     = (struct cl_redux_args_s *) cl_arg;
     const CHAM_tile_t           *tileUinout = cti_interface_get( descr[0] );
     const CHAM_tile_t           *tileUin    = cti_interface_get( descr[1] );
     const int                   *perm       = (int *)STARPU_VECTOR_GET_PTR( descr[2] );
@@ -43,6 +44,7 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg )
     const CHAMELEON_Complex64_t *Uin        = CHAM_tile_get_ptr( tileUin );
 
     int tempmm  = clargs->tempmm;
+    int mb      = clargs->mb;
     int n       = clargs->n;
     int p       = clargs->p;
     int q       = clargs->q;
@@ -51,7 +53,6 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg )
     int np      = clargs->np_inv;
     int me      = ( p <= np ) ? clargs->me / q : ( ( clargs->me / q ) - p_first + p ) % p;
     int nb      = tileUinout->n;
-    int mb      = tileUinout->m;
     int first   = me - 2 * shift + 1;
     int last    = me -     shift;
     int i, m, ownerp;
@@ -71,6 +72,8 @@ cl_zperm_allreduce_cpu_func( void *descr[], void *cl_arg )
 
 CODELETS_CPU( zperm_allreduce, cl_zperm_allreduce_cpu_func )
 
+#if defined(CHAMELEON_STARPU_USE_INSERT)
+
 static void
 INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options,
                                   CHAM_desc_t            *U,
@@ -101,9 +104,10 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
                                   int                     np,
                                   int                     p_first )
 {
-    struct cl_redux_args_t *clargs;
-    clargs = malloc( sizeof( struct cl_redux_args_t ) );
+    struct cl_redux_args_s *clargs;
+    clargs = malloc( sizeof( struct cl_redux_args_s ) );
     clargs->tempmm  = tempmm;
+    clargs->mb      = U->mb;
     clargs->n       = n;
     clargs->p       = p;
     clargs->q       = q;
@@ -114,7 +118,7 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
 
     rt_starpu_insert_task(
         &cl_zperm_allreduce,
-        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_t),
+        STARPU_CL_ARGS,           clargs, sizeof(struct cl_redux_args_s),
         STARPU_RW,                RTBLKADDR(U, CHAMELEON_Complex64_t, me,  n),
         STARPU_R,                 RTBLKADDR(U, CHAMELEON_Complex64_t, src, n),
         STARPU_R,                 RUNTIME_perm_getaddr( ipiv, ipivk ),
@@ -125,6 +129,96 @@ INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
     starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) );
 }
 
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
+static void
+INSERT_TASK_zperm_allreduce_send( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *U,
+                                  int                     me,
+                                  int                     dst,
+                                  int                     n )
+{
+    INSERT_TASK_COMMON_PARAMETERS_CLNULL( zperm_allreduce_send, 1 );
+    /* No task is created here: U(me, n) is only registered for reading, dst being the rank given to the exchange */
+    starpu_cham_exchange_init_params( options, &params, dst );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RTBLKADDR( U, ChamComplexDouble, me, n ),
+                                                  STARPU_R );
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    (void)cl;
+    (void)cl_name;
+}
+
+static void
+INSERT_TASK_zperm_allreduce_recv( const RUNTIME_option_t *options,
+                                  CHAM_desc_t            *U,
+                                  CHAM_ipiv_t            *ipiv,
+                                  int                     ipivk,
+                                  int                     me,
+                                  int                     src,
+                                  int                     n,
+                                  int                     tempmm,
+                                  int                     p,
+                                  int                     q,
+                                  int                     shift,
+                                  int                     np,
+                                  int                     p_first )
+{
+    int ret;
+    struct starpu_task *task;
+    /* NOTE(review): the first macro argument is zperm_allreduce_send although this is the recv variant - confirm it is intended */
+    INSERT_TASK_COMMON_PARAMETERS_EXTENDED( zperm_allreduce_send, zperm_allreduce, redux, 3 );
+    /* Register the handles: U(me, n) in RW and U(src, n) in R through the exchange layer, perm directly */
+    starpu_cham_exchange_init_params( options, &params, me );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RTBLKADDR( U, ChamComplexDouble, me, n ),
+                                                  STARPU_RW );
+    starpu_cham_exchange_handle_before_execution( options, &params, &nbdata, descrs,
+                                                  RTBLKADDR( U, ChamComplexDouble, src, n ),
+                                                  STARPU_R );
+    starpu_cham_register_descr( &nbdata, descrs, RUNTIME_perm_getaddr( ipiv, ipivk ), STARPU_R );
+
+    task = starpu_task_create();
+    task->cl = cl;
+
+    /* Set codelet parameters */
+    clargs = malloc( sizeof( struct cl_redux_args_s ) );
+    clargs->tempmm  = tempmm;
+    clargs->mb      = U->mb; /* cl_zperm_allreduce_cpu_func reads mb from clargs */
+    clargs->n       = n;
+    clargs->p       = p;
+    clargs->q       = q;
+    clargs->p_first = p_first;
+    clargs->me      = me;
+    clargs->shift   = shift;
+    clargs->np_inv  = np;
+
+    task->cl_arg      = clargs;
+    task->cl_arg_size = sizeof( struct cl_redux_args_s );
+    task->cl_arg_free = 1;
+
+    /* Set common parameters */
+    starpu_cham_task_set_options( options, task, nbdata, descrs, NULL );
+
+    /* Flops */
+    task->flops = 0.;
+
+    /* Refine name */
+    task->name = cl_name;
+
+    ret = starpu_task_submit( task );
+    if ( ret == -ENODEV ) {
+        task->destroy = 0;
+        starpu_task_destroy( task );
+        chameleon_error( "INSERT_TASK_zperm_allreduce", "Failed to submit the task to StarPU" );
+        return;
+    }
+    starpu_cham_task_exchange_data_after_execution( options, params, nbdata, descrs );
+    starpu_mpi_cache_flush( options->sequence->comm, RTBLKADDR(U, CHAMELEON_Complex64_t, src, n) );
+}
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
+
 static void
 zperm_allreduce_chameleon_starpu_task( const RUNTIME_option_t     *options,
                                        const CHAM_desc_t          *A,
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index fb83eccc0d65e348efa665eb4f84df5563e246a4..80be1490512d9cc63ddfc6a2f9d98cd6d9f92585 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -66,6 +66,7 @@ CODELETS_CPU( zpotrf, cl_zpotrf_cpu_func )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n, int nb,
                          const CHAM_desc_t *A, int Am, int An,
@@ -116,7 +117,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n, int nb,
@@ -129,7 +130,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, A->get_rankof( A, Am, An ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_RW );
 
     /*
      * Not involved, let's return
@@ -180,4 +181,4 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c
index 4298cd8dd3716c1d112949eaa6689cbacad68a37..c715baff60f576206889892a9d6e07263041c9a1 100644
--- a/runtime/starpu/codelets/codelet_zsymm.c
+++ b/runtime/starpu/codelets/codelet_zsymm.c
@@ -127,6 +127,7 @@ CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options,
                               cham_side_t side, cham_uplo_t uplo,
                               int m, int n, int nb,
@@ -275,7 +276,7 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
         0 );
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void __INSERT_TASK_zsymm( const RUNTIME_option_t *options,
                           cham_side_t side, cham_uplo_t uplo,
@@ -296,9 +297,9 @@ void __INSERT_TASK_zsymm( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, xrank );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -392,4 +393,5 @@ void INSERT_TASK_zsymm( const RUNTIME_option_t *options,
                                 B, Bm, Bn,
                          beta,  C, Cm, Cn );
 }
-#endif
+
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c
index 6ab60fbe5831fc58ccb18c8b1c3d8346ef49ca47..145b926466b2758ada8efd859bc1924bff52ad8e 100644
--- a/runtime/starpu/codelets/codelet_zsyr2k.c
+++ b/runtime/starpu/codelets/codelet_zsyr2k.c
@@ -109,6 +109,7 @@ CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
                          int n, int k, int nb,
@@ -176,7 +177,7 @@ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans,
@@ -201,9 +202,9 @@ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -257,4 +258,4 @@ void INSERT_TASK_zsyr2k( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index d2ea231706b2409f5c440986f0af2175dcb40390..722aa51729022207759258c583bc6c9363851dcd 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -111,6 +111,7 @@ CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
                         int n, int k, int nb,
@@ -176,7 +177,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                         cham_uplo_t uplo, cham_trans_t trans,
@@ -200,8 +201,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, C->get_rankof( C, Cm, Cn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, C, Cm, Cn, accessC  );
 
     /*
      * Not involved, let's return
@@ -254,4 +255,4 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c
index 3c9786734efe00dd355871336ab954922a4e0fcb..f88fdae60663dd49d12a53e4d5a822d466e603b2 100644
--- a/runtime/starpu/codelets/codelet_ztradd.c
+++ b/runtime/starpu/codelets/codelet_ztradd.c
@@ -54,6 +54,7 @@ cl_ztradd_cpu_func(void *descr[], void *cl_arg)
 CODELETS_CPU( ztradd, cl_ztradd_cpu_func )
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
                          CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
@@ -112,7 +113,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int nb,
@@ -136,8 +137,8 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, accessB  );
 
     /*
      * Not involved, let's return
@@ -190,4 +191,4 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index bafef35505b4bed91e0ef7f268102589ed4a3fec..c44697a83b9c562d722df59c6328bffd53ef797a 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -107,6 +107,7 @@ CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
                         int m, int n, int nb,
@@ -162,7 +163,7 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
@@ -176,8 +177,8 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R  );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
 
     /*
      * Not involved, let's return
@@ -231,4 +232,4 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 70fdf057fa3e087bfe9e0d4df61b8727c7aae492..719ce05c3369c4433976554e105ea5a18da73872 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -114,6 +114,7 @@ CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC )
 #endif
 
 #if defined(CHAMELEON_STARPU_USE_INSERT)
+
 void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
                         int m, int n, int nb,
@@ -170,7 +171,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#else
+#else /* defined(CHAMELEON_STARPU_USE_INSERT) */
 
 void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
                         cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,
@@ -184,8 +185,8 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
      * Set the data handles and initialize exchanges if needed
      */
     starpu_cham_exchange_init_params( options, &params, B->get_rankof( B, Bm, Bn ) );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R  );
-    starpu_cham_exchange_data_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, A, Am, An, STARPU_R  );
+    starpu_cham_exchange_tile_before_execution( options, &params, &nbdata, descrs, B, Bm, Bn, STARPU_RW );
 
     /*
      * Not involved, let's return
@@ -239,4 +240,4 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
     (void)nb;
 }
 
-#endif
+#endif /* defined(CHAMELEON_STARPU_USE_INSERT) */
diff --git a/runtime/starpu/include/chameleon_starpu_internal.h b/runtime/starpu/include/chameleon_starpu_internal.h
index c202955a80a5454fdb7c64bbb1f9db5ccd29db17..1100baeeb5a5065d5565697187162aa974c727ac 100644
--- a/runtime/starpu/include/chameleon_starpu_internal.h
+++ b/runtime/starpu/include/chameleon_starpu_internal.h
@@ -226,11 +226,65 @@ chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n)
     const char                  *cl_name = #_name_;             \
     int                          nbdata  = 0;
 
+#define INSERT_TASK_COMMON_PARAMETERS_EXTENDED( _name_task_, _name_cl_, _name_arg_, _nbuffer_ ) \
+    struct starpu_data_descr descrs[_nbuffer_];                                                 \
+    struct starpu_mpi_task_exchange_params params;                                              \
+    struct cl_##_name_arg_##_args_s *clargs  = NULL;                                            \
+    struct starpu_codelet           *cl      = &cl_##_name_cl_;                                 \
+    const char                      *cl_name = #_name_task_;                                    \
+    int                              nbdata  = 0;
+
+#define INSERT_TASK_COMMON_PARAMETERS_CLNULL( _name_, _nbuffer_ ) \
+    struct starpu_data_descr descrs[_nbuffer_];                   \
+    struct starpu_mpi_task_exchange_params params;                \
+    struct starpu_codelet           *cl      = NULL;              \
+    const char                      *cl_name = #_name_;           \
+    int                              nbdata  = 0;
+
 /**
  * This section defines the codelet functions to manage MPI cache and data
  * echanges before and after submitting tasks
  */
 #if !defined(CHAMELEON_STARPU_USE_INSERT)
+
+/**
+ * @brief Internal function to register a handle and its access mode in descrs.
+ *
+ * @param[in,out] nbdata
+ *          On entry, the number of data already registered in descrs. On exit,
+ *          the counter is incremented if the handle has been registered in the
+ *          structure.
+ *
+ * @param[in,out] descrs
+ *          The array of starpu data descriptors (handle + mode). On entry, it
+ *          is allocated to the maximum number of data for the task, and
+ *          contains the already registered nbdata handles and their associated
+ *          modes. On exit, it is updated with the new handle if needed.
+ *
+ * @param[in] handle
+ *          The data handle
+ *
+ * @param[in] mode
+ *          The access mode. Nothing is registered if ( mode & STARPU_NONE ).
+ *
+ */
+static inline void
+starpu_cham_register_descr( int                          *nbdata,
+                            struct starpu_data_descr     *descrs,
+                            starpu_data_handle_t          handle,
+                            enum starpu_data_access_mode  mode )
+{
+    if ( mode & STARPU_NONE ) {
+        return;
+    }
+
+    descrs[*nbdata].handle = handle;
+    descrs[*nbdata].mode   = mode;
+
+    (*nbdata)++;
+    return;
+}
+
 #if !defined(CHAMELEON_USE_MPI)
 
 /**
@@ -251,7 +305,22 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t                 *option
 }
 
 static inline void
-starpu_cham_exchange_data_before_execution( const RUNTIME_option_t                 *options,
+starpu_cham_exchange_handle_before_execution( const RUNTIME_option_t                 *options,
+                                              struct starpu_mpi_task_exchange_params *params,
+                                              int                                    *nbdata,
+                                              struct starpu_data_descr               *descrs,
+                                              starpu_data_handle_t                    handle,
+                                              enum starpu_data_access_mode            mode )
+{
+    starpu_cham_register_descr( nbdata, descrs, handle, mode );
+
+    (void)options;
+    (void)params;
+    return;
+}
+
+static inline void
+starpu_cham_exchange_tile_before_execution( const RUNTIME_option_t                 *options,
                                             struct starpu_mpi_task_exchange_params *params,
                                             int                                    *nbdata,
                                             struct starpu_data_descr               *descrs,
@@ -260,9 +329,7 @@ starpu_cham_exchange_data_before_execution( const RUNTIME_option_t
                                             int                                     An,
                                             enum starpu_data_access_mode            mode )
 {
-    descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An );
-    descrs[*nbdata].mode   = mode;
-    (*nbdata)++;
+    starpu_cham_register_descr( nbdata, descrs, RTBLKADDR( A, ChamComplexDouble, Am, An ), mode );
 
     (void)options;
     (void)params;
@@ -301,6 +368,56 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t                 *option
     params->exchange_needed = 0;
 }
 
+/**
+ * @brief Internal wrapper to starpu_mpi_exchange_data_before_execution() that
+ * first registers the handle and its access mode in descrs. Nothing is done
+ * if ( mode & STARPU_NONE ).
+ *
+ * @param[in] options
+ *          The options to parameterize the task
+ *
+ * @param[in] params
+ *          The starpu parameters for the exchange functions. Needs to be
+ *          initialized by starpu_cham_exchange_init_params() function.
+ *
+ * @param[in,out] nbdata
+ *          On entry, the number of data already registered in descrs. On exit,
+ *          the counter is incremented if the handle has been registered in the
+ *          structure.
+ *
+ * @param[in,out] descrs
+ *          The array of starpu data descriptors (handle + mode). On entry, it
+ *          is allocated to the maximum number of data for the task, and
+ *          contains the already registered nbdata handles and their associated
+ *          modes. On exit, it is updated with the new handle if needed.
+ *
+ * @param[in] handle
+ *          The data handle
+ *
+ * @param[in] mode
+ *          The access mode
+ *
+ */
+static inline void
+starpu_cham_exchange_handle_before_execution( const RUNTIME_option_t                 *options,
+                                              struct starpu_mpi_task_exchange_params *params,
+                                              int                                    *nbdata,
+                                              struct starpu_data_descr               *descrs,
+                                              starpu_data_handle_t                    handle,
+                                              enum starpu_data_access_mode            mode )
+{
+    if ( mode & STARPU_NONE ) {
+        return;
+    }
+
+    starpu_cham_register_descr( nbdata, descrs, handle, mode );
+
+    starpu_mpi_exchange_data_before_execution( options->sequence->comm,
+                                               handle, mode, params );
+
+    return;
+}
+
 /**
  * @brief Internal wrapper to starpu_mpi_task_exchange_data_before_execution(),
  * that also perform the cache operation done in the CAHMELEON_ACCESS_X() macros
@@ -338,7 +455,7 @@ starpu_cham_exchange_init_params( const RUNTIME_option_t                 *option
  *
  */
 static inline void
-starpu_cham_exchange_data_before_execution( const RUNTIME_option_t                 *options,
+starpu_cham_exchange_tile_before_execution( const RUNTIME_option_t                 *options,
                                             struct starpu_mpi_task_exchange_params *params,
                                             int                                    *nbdata,
                                             struct starpu_data_descr               *descrs,
@@ -377,13 +494,8 @@ starpu_cham_exchange_data_before_execution( const RUNTIME_option_t
      * If we need to submit, let's create the data handle and ask StarPU to perform
      * the necessary communications
      */
-    descrs[*nbdata].handle = RTBLKADDR( A, ChamComplexDouble, Am, An );
-    descrs[*nbdata].mode   = mode;
-
-    starpu_mpi_exchange_data_before_execution(
-        options->sequence->comm, descrs[*nbdata].handle, mode, params );
-
-    (*nbdata)++;
+    starpu_cham_exchange_handle_before_execution( options, params, nbdata, descrs,
+                                                  RTBLKADDR( A, ChamComplexDouble, Am, An ), mode );
     return;
 }
 
@@ -447,6 +559,7 @@ starpu_cham_task_set_options( const RUNTIME_option_t   *options,
                               struct starpu_data_descr *descrs,
                               callback_fct_t            callback )
 {
+    int allocated_buffers = 0;
     int i;
 
     task->priority = options->priority;
@@ -473,6 +586,10 @@ starpu_cham_task_set_options( const RUNTIME_option_t   *options,
     // task->where; /* Do restriction here */
 
     task->nbuffers = nbdata;
+
+    /* Dynamic handles */
+    starpu_task_insert_data_make_room( task->cl, task, &allocated_buffers, 0, task->nbuffers );
+
     for ( i = 0; i < task->nbuffers; i++ ) {
         enum starpu_data_access_mode mode = descrs[i].mode;
         assert( descrs[i].handle );
diff --git a/testing/CTestLists.cmake b/testing/CTestLists.cmake
index c8d012141de283ea45c4ebe3a4ae3f9270d7e435..39b7e89e04daf060dcacc87d17b851ee83e3191d 100644
--- a/testing/CTestLists.cmake
+++ b/testing/CTestLists.cmake
@@ -94,26 +94,28 @@ if (NOT CHAMELEON_SIMULATION)
         endforeach()
 
         if ( CHAMELEON_SCHED_STARPU )
-            add_test( test_${cat}_${prec}getrf_nopivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 --diag=ChamUnit -f input/getrf_nopiv.in )
-            set_tests_properties( test_${cat}_${prec}getrf_nopivpercol
-                                  PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=nopivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" )
-
             if ( HAVE_STARPU_NONE_NONZERO )
-                add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
+                add_test( test_${cat}_${prec}getrf_ppivpercol ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
                 set_tests_properties( test_${cat}_${prec}getrf_ppivpercol
-                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" )
+                                      PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=0" )
 
-                add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf_nopiv.in )
+                add_test( test_${cat}_${prec}getrf_ppivpercol_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
                 set_tests_properties( test_${cat}_${prec}getrf_ppivpercol_batch
-                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=6" )
+                                      PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppivpercolumn;CHAMELEON_GETRF_BATCH_SIZE=3" )
+
+                add_test( test_${cat}_${prec}getrf_ppivblocked ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
+                set_tests_properties( test_${cat}_${prec}getrf_ppivblocked
+                                      PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" )
 
-                add_test( test_${cat}_${prec}getrf_ppiv ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
-                set_tests_properties( test_${cat}_${prec}getrf_ppiv
-                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0" )
+                add_test( test_${cat}_${prec}getrf_ppivblocked_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
+                set_tests_properties( test_${cat}_${prec}getrf_ppivblocked_batch
+                                      PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=3" )
 
-                add_test( test_${cat}_${prec}getrf_ppiv_batch ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P 1 -f input/getrf.in )
-                set_tests_properties( test_${cat}_${prec}getrf_ppiv_batch
-                                    PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=6" )
+                if ( ${cat} STREQUAL "mpi" )
+                    add_test( test_${cat}_${prec}getrf_ppiv_comm_with_task ${PREFIX} ${CMD} -c -t ${THREADS} -g ${gpus} -P ${NP} -f input/getrf.in )
+                    set_tests_properties( test_${cat}_${prec}getrf_ppiv_comm_with_task
+                                          PROPERTIES ENVIRONMENT "CHAMELEON_GETRF_ALGO=ppiv;CHAMELEON_GETRF_BATCH_SIZE=0;CHAMELEON_GETRF_ALL_REDUCE=cham_spu_tasks" )
+                endif()
             endif()
         endif()
 
diff --git a/testing/input/getrf.in b/testing/input/getrf.in
index c0c99c52c69b54daa2f915d3fb636ad805b88b84..fec8a04f08f6b357e3e609b7f4421ff83281be47 100644
--- a/testing/input/getrf.in
+++ b/testing/input/getrf.in
@@ -11,6 +11,6 @@
 op = getrf
 nb = 16, 17
 ib = 16, 5
-m = 13, 17, 35
-n = 15, 19, 33
-lda = 41
+m = 13, 17, 35, 130
+n = 15, 19, 33, 115
+lda = 131