diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c515d7e5c5dfa0d0b0e4980a3336eb2ecdbb32b..b064cf15135af1b366763e1e8a78e9b903f02047 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -366,15 +366,6 @@ if(NOT CHAMELEON_SIMULATION)
             endif()
             if (CUDA_LIBRARIES)
                 set_target_properties(CUDA::CUDA PROPERTIES INTERFACE_LINK_LIBRARIES "${CUDA_LIBRARIES}")
-                set(CMAKE_REQUIRED_INCLUDES  "${CUDA_INCLUDE_DIRS}")
-                set(CMAKE_REQUIRED_LIBRARIES "${CUDA_LIBRARIES}")
-                if(CUDA_VERSION VERSION_LESS "4.0")
-                    set(CUDA_HAVE_PEER_DEVICE_MEMORY_ACCESS 0)
-                else()
-                    check_function_exists(cuDeviceCanAccessPeer CUDA_HAVE_PEER_DEVICE_MEMORY_ACCESS)
-                endif()
-                unset(CMAKE_REQUIRED_INCLUDES)
-                unset(CMAKE_REQUIRED_LIBRARIES)
                 # Add cublas if found
                 if (CUDA_CUBLAS_LIBRARIES)
                     set_target_properties(CUDA::CUBLAS PROPERTIES INTERFACE_LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}")
diff --git a/control/descriptor.h b/control/descriptor.h
index 6bf6d9a81ad9cb57458207b500b45b8fdc0bdc3d..dda8fa71906169a3dde202e670e27c6c1c70467b 100644
--- a/control/descriptor.h
+++ b/control/descriptor.h
@@ -233,33 +233,41 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n )
  * CHAMELEON_ACCESS_RW(C, Cm, Cn)
  * CHAMELEON_END_ACCESS_DECLARATION
  */
-#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \
-    unsigned __chameleon_need_submit = 0; \
+#define CHAMELEON_BEGIN_ACCESS_DECLARATION {    \
+    unsigned __chameleon_need_exec = 0;         \
+    unsigned __chameleon_need_submit = 0;       \
     RUNTIME_BEGIN_ACCESS_DECLARATION
 
-#define CHAMELEON_ACCESS_R(A, Am, An) do { \
-    if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \
-    RUNTIME_ACCESS_R(A, Am, An); \
-} while(0)
-
-#define CHAMELEON_ACCESS_W(A, Am, An) do { \
-    if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \
-    RUNTIME_ACCESS_W(A, Am, An); \
-} while(0)
-
-#define CHAMELEON_ACCESS_RW(A, Am, An) do { \
-    if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \
-    RUNTIME_ACCESS_RW(A, Am, An); \
-} while(0)
-
-#define CHAMELEON_RANK_CHANGED(rank) do {\
-    __chameleon_need_submit = 1; \
-    RUNTIME_RANK_CHANGED(rank); \
-} while (0)
-
-#define CHAMELEON_END_ACCESS_DECLARATION \
-    RUNTIME_END_ACCESS_DECLARATION; \
-    if (!__chameleon_need_submit) return; \
+#define CHAMELEON_ACCESS_R(A, Am, An) do {                              \
+        if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \
+        RUNTIME_ACCESS_R(A, Am, An);                                    \
+    } while(0)
+
+#define CHAMELEON_ACCESS_W(A, Am, An) do {              \
+        if (chameleon_desc_islocal(A, Am, An)) {        \
+            __chameleon_need_exec = 1;                  \
+            __chameleon_need_submit = 1;                \
+        }                                               \
+        RUNTIME_ACCESS_W(A, Am, An);                    \
+    } while(0)
+
+#define CHAMELEON_ACCESS_RW(A, Am, An) do {             \
+        if (chameleon_desc_islocal(A, Am, An)) {        \
+            __chameleon_need_exec = 1;                  \
+            __chameleon_need_submit = 1;                \
+        }                                               \
+        RUNTIME_ACCESS_RW(A, Am, An);                   \
+    } while(0)
+
+#define CHAMELEON_RANK_CHANGED(rank) do {       \
+        __chameleon_need_submit = 1;            \
+        RUNTIME_RANK_CHANGED(rank);             \
+    } while (0)
+
+#define CHAMELEON_END_ACCESS_DECLARATION        \
+    RUNTIME_END_ACCESS_DECLARATION;             \
+    if (!__chameleon_need_submit) return;       \
+    (void)__chameleon_need_exec;                \
 }
 
 #ifdef __cplusplus
diff --git a/runtime/starpu/codelets/codelet_zcesca.c b/runtime/starpu/codelets/codelet_zcesca.c
index 5dbadec77774438ef701cb9d07b4f7af579094f1..1cf87fe7e330d4f636c648cb4a7886378e93a714 100644
--- a/runtime/starpu/codelets/codelet_zcesca.c
+++ b/runtime/starpu/codelets/codelet_zcesca.c
@@ -31,7 +31,7 @@ struct cl_zcesca_args_s {
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zcesca_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zcesca_args_s clargs;
+    struct cl_zcesca_args_s *clargs = (struct cl_zcesca_args_s *)cl_arg;
     CHAM_tile_t *Gi;
     CHAM_tile_t *Gj;
     CHAM_tile_t *G;
@@ -46,9 +46,8 @@ static void cl_zcesca_cpu_func(void *descr[], void *cl_arg)
     Dj = cti_interface_get(descr[4]);
     A  = cti_interface_get(descr[5]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zcesca( clargs.center, clargs.scale, clargs.axis,
-                  clargs.m, clargs.n, clargs.mt, clargs.nt,
+    TCORE_zcesca( clargs->center, clargs->scale, clargs->axis,
+                  clargs->m, clargs->n, clargs->mt, clargs->nt,
                   Gi, Gj, G, Di, Dj, A );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -68,19 +67,12 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options,
                          const CHAM_desc_t *Dj, int Djm, int Djn,
                          CHAM_desc_t *A, int Am, int An )
 {
-    struct cl_zcesca_args_s clargs = {
-        .center = center,
-        .scale  = scale,
-        .axis   = axis,
-        .m      = m,
-        .n      = n,
-        .mt     = mt,
-        .nt     = nt
-    };
+    struct cl_zcesca_args_s *clargs = NULL;
     struct starpu_codelet *codelet = &cl_zcesca;
     void (*callback)(void*) = options->profiling ? cl_zcesca_callback : NULL;
     starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt);
     int workerid = (schedopt == NULL) ? -1 : schedopt->workerid;
+    int exec = 0;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(Gi, Gim, Gin);
@@ -89,11 +81,23 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options,
     CHAMELEON_ACCESS_R(Di, Dim, Din);
     CHAMELEON_ACCESS_R(Dj, Djm, Djn);
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zcesca_args_s ) );
+        clargs->center = center;
+        clargs->scale  = scale;
+        clargs->axis   = axis;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->mt     = mt;
+        clargs->nt     = nt;
+    }
+
     rt_starpu_insert_task(
         codelet,
-        STARPU_VALUE, &clargs, sizeof(struct cl_zcesca_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zcesca_args_s),
         STARPU_R,        RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin),
         STARPU_R,        RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn),
         STARPU_R,        RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn),
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 39d242c01e4d6bf4f41eeb58a22f1b1e94612393..e66b85dedb1c17a8bff8e8ceb49f18fd662060dd 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -19,6 +19,7 @@
  * @author Cedric Castagnede
  * @author Lucas Barros de Assis
  * @author Florent Pruvost
+ * @author Gwenole Lucas
  * @date 2021-03-16
  * @precisions normal z -> c d s
  *
@@ -43,7 +44,7 @@ struct cl_zgemm_args_s {
 static void
 cl_zgemm_cpu_func( void *descr[], void *cl_arg )
 {
-    struct cl_zgemm_args_s clargs;
+    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAM_tile_t *tileC;
@@ -52,18 +53,17 @@ cl_zgemm_cpu_func( void *descr[], void *cl_arg )
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zgemm( clargs.transA, clargs.transB,
-                 clargs.m, clargs.n, clargs.k,
-                 clargs.alpha, tileA, tileB,
-                 clargs.beta,  tileC );
+    TCORE_zgemm( clargs->transA, clargs->transB,
+                 clargs->m, clargs->n, clargs->k,
+                 clargs->alpha, tileA, tileB,
+                 clargs->beta,  tileC );
 }
 
 #ifdef CHAMELEON_USE_CUDA
 static void
-cl_zgemm_cuda_func( void *descr[], void *_cl_arg )
+cl_zgemm_cuda_func( void *descr[], void *cl_arg )
 {
-    struct cl_zgemm_args_s clargs;
+    struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
     CHAM_tile_t *tileC;
@@ -72,17 +72,19 @@ cl_zgemm_cuda_func( void *descr[], void *_cl_arg )
     tileB = cti_interface_get(descr[1]);
     tileC = cti_interface_get(descr[2]);
 
-    starpu_codelet_unpack_args( _cl_arg, &clargs );
-
     RUNTIME_getStream( stream );
 
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+
     CUDA_zgemm(
-        clargs.transA, clargs.transB,
-        clargs.m, clargs.n, clargs.k,
-        (cuDoubleComplex*)&(clargs.alpha),
+        clargs->transA, clargs->transB,
+        clargs->m, clargs->n, clargs->k,
+        (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         tileB->mat, tileB->ld,
-        (cuDoubleComplex*)&(clargs.beta),
+        (cuDoubleComplex*)&(clargs->beta),
         tileC->mat, tileC->ld,
         stream );
 
@@ -112,22 +114,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
                                     beta, C, Cm, Cn );
     }
 
-    struct cl_zgemm_args_s clargs = {
-        .transA = transA,
-        .transB = transB,
-        .m      = m,
-        .n      = n,
-        .k      = k,
-        .alpha  = alpha,
-        .tileA  = A->get_blktile( A, Am, An ),
-        .tileB  = B->get_blktile( B, Bm, Bn ),
-        .beta   = beta,
-        .tileC  = C->get_blktile( C, Cm, Cn )
-    };
+    struct cl_zgemm_args_s  *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid, accessC;
+    int                      exec = 0;
     char                    *cl_name = "zgemm";
 
     /* Handle cache */
@@ -135,8 +127,23 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_R(B, Bm, Bn);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zgemm_args_s ) );
+        clargs->transA = transA;
+        clargs->transB = transB;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->k      = k;
+        clargs->alpha  = alpha;
+        clargs->tileA  = A->get_blktile( A, Am, An );
+        clargs->tileB  = B->get_blktile( B, Bm, Bn );
+        clargs->beta   = beta;
+        clargs->tileC  = C->get_blktile( C, Cm, Cn );
+    }
+
     /* Callback for profiling information */
     callback = options->profiling ? cl_zgemm_callback : NULL;
 
@@ -150,10 +157,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zgemm,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zgemm_args_s),
-        STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
-        STARPU_R,      RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
-        accessC,       RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s),
+
+        /* Task handles */
+        STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
+        accessC,  RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
 
         /* Common task arguments */
         STARPU_PRIORITY,          options->priority,
diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c
index 5fbc6b09a9a1f2d733e9a4f52aecc7320a17b08f..5dad32c9afae12291dbc1729cd3527264d86cac8 100644
--- a/runtime/starpu/codelets/codelet_zgessm.c
+++ b/runtime/starpu/codelets/codelet_zgessm.c
@@ -43,7 +43,6 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg)
     tileD = cti_interface_get(descr[1]);
     tileA = cti_interface_get(descr[2]);
 
-
     starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV);
     TCORE_zgessm(m, n, k, ib, IPIV, tileD, tileA);
 }
diff --git a/runtime/starpu/codelets/codelet_zgesum.c b/runtime/starpu/codelets/codelet_zgesum.c
index 31a6ca93e5c8f9fc74a12e656309ca3059cb731d..c851c4ef555bc1ee792a712bfaa028795df4252c 100644
--- a/runtime/starpu/codelets/codelet_zgesum.c
+++ b/runtime/starpu/codelets/codelet_zgesum.c
@@ -27,15 +27,14 @@ struct cl_zgesum_args_s {
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zgesum_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zgesum_args_s clargs;
+    struct cl_zgesum_args_s *clargs = (struct cl_zgesum_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileW;
 
     tileA = cti_interface_get(descr[0]);
     tileW = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zgesum( clargs.storev, clargs.m, clargs.n, tileA, tileW );
+    TCORE_zgesum( clargs->storev, clargs->m, clargs->n, tileA, tileW );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -49,24 +48,29 @@ void INSERT_TASK_zgesum( const RUNTIME_option_t *options,
                          const CHAM_desc_t *A, int Am, int An,
                          const CHAM_desc_t *SUMS, int SUMSm, int SUMSn )
 {
-    struct cl_zgesum_args_s clargs = {
-        .storev = storev,
-        .m      = m,
-        .n      = n
-    };
+    struct cl_zgesum_args_s *clargs = NULL;
     struct starpu_codelet *codelet = &cl_zgesum;
     void (*callback)(void*) = options->profiling ? cl_zgesum_callback : NULL;
     starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt);
     int workerid = (schedopt == NULL) ? -1 : schedopt->workerid;
+    int exec = 0;
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(SUMS, SUMSm, SUMSn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zgesum_args_s ) );
+        clargs->storev = storev;
+        clargs->m      = m;
+        clargs->n      = n;
+    }
+
     rt_starpu_insert_task(
         codelet,
-        STARPU_VALUE, &clargs, sizeof(struct cl_zgesum_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zgesum_args_s),
         STARPU_R,        RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         STARPU_RW,       RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn),
         STARPU_PRIORITY, options->priority,
diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c
index 91b62cec66d69b9ce8f6340b781a0b5ba167b99c..9c918d9ef5c2b37de710f2e766cf448de8bbfe99 100644
--- a/runtime/starpu/codelets/codelet_zhe2ge.c
+++ b/runtime/starpu/codelets/codelet_zhe2ge.c
@@ -35,7 +35,6 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-
     starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N);
     TCORE_zhe2ge(uplo, M, N, tileA, tileB);
 }
diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c
index 1b8057b1409ef597ab99892a7482f811249d4620..3cc33f4236be6b6619f91c1032df22e787d11724 100644
--- a/runtime/starpu/codelets/codelet_zherk.c
+++ b/runtime/starpu/codelets/codelet_zherk.c
@@ -19,6 +19,7 @@
  * @author Cedric Castagnede
  * @author Lucas Barros de Assis
  * @author Florent Pruvost
+ * @author Gwenole Lucas
  * @date 2021-03-16
  * @precisions normal z -> c
  *
@@ -41,38 +42,35 @@ struct cl_zherk_args_s {
 static void
 cl_zherk_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zherk_args_s clargs;
+    struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zherk( clargs.uplo, clargs.trans, clargs.n, clargs.k,
-                 clargs.alpha, tileA, clargs.beta, tileC );
+    TCORE_zherk( clargs->uplo, clargs->trans, clargs->n, clargs->k,
+                 clargs->alpha, tileA, clargs->beta, tileC );
 }
 
 #if defined(CHAMELEON_USE_CUDA)
 static void
 cl_zherk_cuda_func(void *descr[], void *cl_arg)
 {
-    struct cl_zherk_args_s clargs;
+    struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-
     RUNTIME_getStream(stream);
 
     CUDA_zherk(
-        clargs.uplo, clargs.trans, clargs.n, clargs.k,
-        (cuDoubleComplex*)&(clargs.alpha),
+        clargs->uplo, clargs->trans, clargs->n, clargs->k,
+        &(clargs->alpha),
         tileA->mat, tileA->ld,
-        (cuDoubleComplex*)&(clargs.beta),
+        &(clargs->beta),
         tileC->mat, tileC->ld,
         stream );
 
@@ -101,28 +99,33 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
                                     beta, C, Cm, Cn );
     }
 
-    struct cl_zherk_args_s clargs = {
-        .uplo  = uplo,
-        .trans = trans,
-        .n     = n,
-        .k     = k,
-        .alpha = alpha,
-        .tileA = A->get_blktile( A, Am, An ),
-        .beta  = beta,
-        .tileC = C->get_blktile( C, Cm, Cn ),
-    };
+    struct cl_zherk_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid, accessC;
+    int                      exec = 0;
     char                    *cl_name = "zherk";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zherk_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zherk_callback : NULL;
 
@@ -136,7 +139,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zherk,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zherk_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zherk_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         accessC,       RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
 
diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c
index 2501eb14833810892e7a5ced85f1f7659eb1d933..b169055095f5d54d5f825921306f28150b4df275 100644
--- a/runtime/starpu/codelets/codelet_zlacpy.c
+++ b/runtime/starpu/codelets/codelet_zlacpy.c
@@ -41,20 +41,19 @@ struct cl_zlacpy_args_s {
 static void
 cl_zlacpy_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zlacpy_args_s clargs;
+    struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    assert( clargs.displA == 0 );
-    assert( clargs.displB == 0 );
+    assert( clargs->displA == 0 );
+    assert( clargs->displB == 0 );
     /* A = tileA->mat; */
     /* B = tileB->mat; */
     /* CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); */
-    TCORE_zlacpy( clargs.uplo, clargs.m, clargs.n, tileA, tileB );
+    TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -68,27 +67,32 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
                           int displA, const CHAM_desc_t *A, int Am, int An,
                           int displB, const CHAM_desc_t *B, int Bm, int Bn )
 {
-    struct cl_zlacpy_args_s clargs = {
-        .uplo   = uplo,
-        .m      = m,
-        .n      = n,
-        .displA = displA,
-        .displB = displB,
-        .tileA  = A->get_blktile( A, Am, An ),
-        .tileB  = B->get_blktile( B, Bm, Bn ),
-    };
+    struct cl_zlacpy_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zlacpy";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_W(B, Bm, Bn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zlacpy_args_s ) );
+        clargs->uplo   = uplo;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->displA = displA;
+        clargs->displB = displB;
+        clargs->tileA  = A->get_blktile( A, Am, An );
+        clargs->tileB  = B->get_blktile( B, Bm, Bn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zlacpy_callback : NULL;
 
@@ -99,7 +103,7 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zlacpy,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zlacpy_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         STARPU_W,      RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
 
diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c
index c2204d964180306a8fd833b9024e27556779d9b3..27358b138066d2344ecedd839fd9239f86fb706f 100644
--- a/runtime/starpu/codelets/codelet_zlag2c.c
+++ b/runtime/starpu/codelets/codelet_zlag2c.c
@@ -38,7 +38,6 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-
     starpu_codelet_unpack_args(cl_arg, &m, &n);
     TCORE_zlag2c( m, n, tileA, tileB);
 }
@@ -96,7 +95,6 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg)
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-
     starpu_codelet_unpack_args(cl_arg, &m, &n);
     TCORE_clag2z( m, n, tileA, tileB);
 }
diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c
index 0b9620fca00ed3d3fc13245e1f670b05e1261ea4..0dcaa52d54a4033c6b41629f23e98421a5ed72a5 100644
--- a/runtime/starpu/codelets/codelet_zlascal.c
+++ b/runtime/starpu/codelets/codelet_zlascal.c
@@ -36,13 +36,12 @@ struct cl_zlascal_args_s {
 static void
 cl_zlascal_cpu_func( void *descr[], void *cl_arg )
 {
-    struct cl_zlascal_args_s clargs;
+    struct cl_zlascal_args_s *clargs = (struct cl_zlascal_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zlascal( clargs.uplo, clargs.m, clargs.n, clargs.alpha, tileA );
+    TCORE_zlascal( clargs->uplo, clargs->m, clargs->n, clargs->alpha, tileA );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -65,24 +64,29 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
         return;
     }
 
-    struct cl_zlascal_args_s clargs = {
-        .uplo  = uplo,
-        .m     = m,
-        .n     = n,
-        .alpha = alpha,
-        .tileA = A->get_blktile( A, Am, An ),
-    };
+    struct cl_zlascal_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zlascal";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zlascal_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zlascal_callback : NULL;
 
@@ -93,7 +97,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zlascal,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zlascal_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zlascal_args_s),
         STARPU_RW,     RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c
index c3fb53b0fdb0b7732c69d785fbf2c381a4064e60..95805c0142f2b966125bac51b9290bcf0f324bdf 100644
--- a/runtime/starpu/codelets/codelet_zlaset.c
+++ b/runtime/starpu/codelets/codelet_zlaset.c
@@ -39,13 +39,12 @@ struct cl_zlaset_args_s {
 static void
 cl_zlaset_cpu_func( void *descr[], void *cl_arg )
 {
-    struct cl_zlaset_args_s clargs;
+    struct cl_zlaset_args_s *clargs = (struct cl_zlaset_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zlaset( clargs.uplo, clargs.m, clargs.n, clargs.alpha, clargs.beta, tileA );
+    TCORE_zlaset( clargs->uplo, clargs->m, clargs->n, clargs->alpha, clargs->beta, tileA );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -59,25 +58,30 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
                          CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta,
                          const CHAM_desc_t *A, int Am, int An )
 {
-    struct cl_zlaset_args_s clargs = {
-        .uplo  = uplo,
-        .m     = m,
-        .n     = n,
-        .alpha = alpha,
-        .beta  = beta,
-        .tileA = A->get_blktile( A, Am, An ),
-    };
+    struct cl_zlaset_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zlaset";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_W(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zlaset_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->beta  = beta;
+        clargs->tileA = A->get_blktile( A, Am, An );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zlaset_callback : NULL;
 
@@ -88,7 +92,7 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zlaset,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zlaset_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaset_args_s),
         STARPU_W,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c
index 740fdc0fba5bbdcde1ee0258821013d59836ed4b..700352172f44c4f3fdbda09f0474d073cb44bc44 100644
--- a/runtime/starpu/codelets/codelet_zlauum.c
+++ b/runtime/starpu/codelets/codelet_zlauum.c
@@ -37,13 +37,12 @@ struct cl_zlauum_args_s {
 static void
 cl_zlauum_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zlauum_args_s clargs;
+    struct cl_zlauum_args_s *clargs = (struct cl_zlauum_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zlauum( clargs.uplo, clargs.n, tileA );
+    TCORE_zlauum( clargs->uplo, clargs->n, tileA );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -56,22 +55,27 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, int n, int nb,
                          const CHAM_desc_t *A, int Am, int An )
 {
-    struct cl_zlauum_args_s clargs = {
-        .uplo  = uplo,
-        .n     = n,
-        .tileA = A->get_blktile( A, Am, An ),
-    };
+    struct cl_zlauum_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zlauum";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zlauum_args_s ) );
+        clargs->uplo  = uplo;
+        clargs->n     = n;
+        clargs->tileA = A->get_blktile( A, Am, An );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zlauum_callback : NULL;
 
@@ -82,7 +86,7 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zlauum,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zlauum_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zlauum_args_s),
         STARPU_RW,     RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c
index a58aed69b013d4df222b681f7bee42275b98bf53..68bf93fc960e44beeed0468da28694d82562fce5 100644
--- a/runtime/starpu/codelets/codelet_zplghe.c
+++ b/runtime/starpu/codelets/codelet_zplghe.c
@@ -41,14 +41,13 @@ struct cl_zplghe_args_s {
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zplghe_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zplghe_args_s clargs;
+    struct cl_zplghe_args_s *clargs = (struct cl_zplghe_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zplghe( clargs.bump, clargs.m, clargs.n, tileA,
-                  clargs.bigM, clargs.m0, clargs.n0, clargs.seed );
+    TCORE_zplghe( clargs->bump, clargs->m, clargs->n, tileA,
+                  clargs->bigM, clargs->m0, clargs->n0, clargs->seed );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -61,27 +60,38 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options,
                          double bump, int m, int n, const CHAM_desc_t *A, int Am, int An,
                          int bigM, int m0, int n0, unsigned long long int seed )
 {
-    struct cl_zplghe_args_s clargs = {
-        .bump  = bump,
-        .m     = m,
-        .n     = n,
-        .tileA = A->get_blktile( A, Am, An ),
-        .bigM  = bigM,
-        .m0    = m0,
-        .n0    = n0,
-        .seed  = seed,
-    };
+    struct cl_zplghe_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zplghe";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_W(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zplghe_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->bump  = bump;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->bigM  = bigM;
+        clargs->m0    = m0;
+        clargs->n0    = n0;
+        clargs->seed  = seed;
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zplghe_callback : NULL;
 
@@ -92,7 +102,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zplghe,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zplghe_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zplghe_args_s),
         STARPU_W,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c
index f3afa23ebea541f9989767f834a1900367ae24ff..2d342bac33f25e4e3e5ff60be624fe0d81b798c5 100644
--- a/runtime/starpu/codelets/codelet_zplgsy.c
+++ b/runtime/starpu/codelets/codelet_zplgsy.c
@@ -41,14 +41,13 @@ struct cl_zplgsy_args_s {
 #if !defined(CHAMELEON_SIMULATION)
 static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zplgsy_args_s clargs;
+    struct cl_zplgsy_args_s *clargs = (struct cl_zplgsy_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zplgsy( clargs.bump, clargs.m, clargs.n, tileA,
-                  clargs.bigM, clargs.m0, clargs.n0, clargs.seed );
+    TCORE_zplgsy( clargs->bump, clargs->m, clargs->n, tileA,
+                  clargs->bigM, clargs->m0, clargs->n0, clargs->seed );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -61,27 +60,38 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options,
                          CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An,
                          int bigM, int m0, int n0, unsigned long long int seed )
 {
-    struct cl_zplgsy_args_s clargs = {
-        .bump  = bump,
-        .m     = m,
-        .n     = n,
-        .tileA = A->get_blktile( A, Am, An ),
-        .bigM  = bigM,
-        .m0    = m0,
-        .n0    = n0,
-        .seed  = seed,
-    };
+    struct cl_zplgsy_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zplgsy";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_W(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zplgsy_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->bump  = bump;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->bigM  = bigM;
+        clargs->m0    = m0;
+        clargs->n0    = n0;
+        clargs->seed  = seed;
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zplgsy_callback : NULL;
 
@@ -92,7 +102,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zplgsy,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zplgsy_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zplgsy_args_s),
         STARPU_W,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c
index 05df0d107f8b46bbb6df07891e5373a9308b8f1a..36191f43e2486467220f6ee4cda694cd857c1d4a 100644
--- a/runtime/starpu/codelets/codelet_zplrnt.c
+++ b/runtime/starpu/codelets/codelet_zplrnt.c
@@ -41,14 +41,13 @@ struct cl_zplrnt_args_s {
 static void
 cl_zplrnt_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zplrnt_args_s clargs;
+    struct cl_zplrnt_args_s *clargs = (struct cl_zplrnt_args_s *)cl_arg;
     CHAM_tile_t *tileA;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zplrnt( clargs.m, clargs.n, tileA,
-                  clargs.bigM, clargs.m0, clargs.n0, clargs.seed );
+    TCORE_zplrnt( clargs->m, clargs->n, tileA,
+                  clargs->bigM, clargs->m0, clargs->n0, clargs->seed );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -61,26 +60,37 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
                          int m, int n, const CHAM_desc_t *A, int Am, int An,
                          int bigM, int m0, int n0, unsigned long long int seed )
 {
-    struct cl_zplrnt_args_s clargs = {
-        .m     = m,
-        .n     = n,
-        .tileA = A->get_blktile( A, Am, An ),
-        .bigM  = bigM,
-        .m0    = m0,
-        .n0    = n0,
-        .seed  = seed,
-    };
+    struct cl_zplrnt_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zplrnt";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_W(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zplrnt_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->bigM  = bigM;
+        clargs->m0    = m0;
+        clargs->n0    = n0;
+        clargs->seed  = seed;
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zplrnt_callback : NULL;
 
@@ -91,7 +101,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zplrnt,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zplrnt_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zplrnt_args_s),
         STARPU_W,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 19da18c46dcac24a734ffcca15693bf13c0682ae..883645b31185692edbac2bba01005f290a738ce7 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -40,17 +40,16 @@ struct cl_zpotrf_args_s {
 static void
 cl_zpotrf_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zpotrf_args_s clargs;
+    struct cl_zpotrf_args_s *clargs = (struct cl_zpotrf_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     int info = 0;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zpotrf( clargs.uplo, clargs.n, tileA, &info );
+    TCORE_zpotrf( clargs->uplo, clargs->n, tileA, &info );
 
-    if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
-        RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info );
+    if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
+        RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info );
     }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -65,25 +64,36 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
                          const CHAM_desc_t *A, int Am, int An,
                          int iinfo )
 {
-    struct cl_zpotrf_args_s clargs = {
-        .uplo     = uplo,
-        .n        = n,
-        .tileA    = A->get_blktile( A, Am, An ),
-        .iinfo    = iinfo,
-        .sequence = options->sequence,
-        .request  = options->request,
-    };
+    struct cl_zpotrf_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "zpotrf";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zpotrf_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->uplo     = uplo;
+        clargs->n        = n;
+        clargs->tileA    = A->get_blktile( A, Am, An );
+        clargs->iinfo    = iinfo;
+        clargs->sequence = options->sequence;
+        clargs->request  = options->request;
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zpotrf_callback : NULL;
 
@@ -94,7 +104,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_zpotrf,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zpotrf_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zpotrf_args_s),
         STARPU_RW,     RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index 9af6d9a824498c908b98eb08e8bf1dcd50c665c0..a878aaa19adfc5c60ece27a785a7dd2fd1e67986 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -19,6 +19,7 @@
  * @author Cedric Castagnede
  * @author Lucas Barros de Assis
  * @author Florent Pruvost
+ * @author Gwenole Lucas
  * @date 2021-03-16
  * @precisions normal z -> c d s
  *
@@ -41,38 +42,35 @@ struct cl_zsyrk_args_s {
 static void
 cl_zsyrk_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_zsyrk_args_s clargs;
+    struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_zsyrk( clargs.uplo, clargs.trans, clargs.n, clargs.k,
-                 clargs.alpha, tileA, clargs.beta, tileC );
+    TCORE_zsyrk( clargs->uplo, clargs->trans, clargs->n, clargs->k,
+                 clargs->alpha, tileA, clargs->beta, tileC );
 }
 
 #if defined(CHAMELEON_USE_CUDA)
 static void
 cl_zsyrk_cuda_func(void *descr[], void *cl_arg)
 {
-    struct cl_zsyrk_args_s clargs;
+    struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileC;
 
     tileA = cti_interface_get(descr[0]);
     tileC = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-
     RUNTIME_getStream(stream);
 
     CUDA_zsyrk(
-        clargs.uplo, clargs.trans, clargs.n, clargs.k,
-        (cuDoubleComplex*)&(clargs.alpha),
+        clargs->uplo, clargs->trans, clargs->n, clargs->k,
+        (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
-        (cuDoubleComplex*)&(clargs.beta),
+        (cuDoubleComplex*)&(clargs->beta),
         tileC->mat, tileC->ld,
         stream );
 
@@ -101,28 +99,39 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
                                     beta, C, Cm, Cn );
     }
 
-    struct cl_zsyrk_args_s clargs = {
-        .uplo  = uplo,
-        .trans = trans,
-        .n     = n,
-        .k     = k,
-        .alpha = alpha,
-        .tileA = A->get_blktile( A, Am, An ),
-        .beta  = beta,
-        .tileC = C->get_blktile( C, Cm, Cn ),
-    };
+    struct cl_zsyrk_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid, accessC;
+    int                      exec = 0;
     char                    *cl_name = "zsyrk";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(C, Cm, Cn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_zsyrk_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->n     = n;
+        clargs->k     = k;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->beta  = beta;
+        clargs->tileC = C->get_blktile( C, Cm, Cn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_zsyrk_callback : NULL;
 
@@ -135,9 +144,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
     /* Insert the task */
     rt_starpu_insert_task(
         &cl_zsyrk,
-
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_zsyrk_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyrk_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         accessC,       RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn),
 
diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c
index 6555e6d1737be7bb41244a6539d02c8583f3b072..d59f03a5f3e4e9695c728f4eeb142add91e5192f 100644
--- a/runtime/starpu/codelets/codelet_ztradd.c
+++ b/runtime/starpu/codelets/codelet_ztradd.c
@@ -38,16 +38,15 @@ struct cl_ztradd_args_s {
 static void
 cl_ztradd_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztradd_args_s clargs;
+    struct cl_ztradd_args_s *clargs = (struct cl_ztradd_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_ztradd( clargs.uplo, clargs.trans, clargs.m, clargs.n,
-                  clargs.alpha, tileA, clargs.beta, tileB );
+    TCORE_ztradd( clargs->uplo, clargs->trans, clargs->m, clargs->n,
+                  clargs->alpha, tileA, clargs->beta, tileB );
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
@@ -66,28 +65,39 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
                                     beta, B, Bm, Bn );
     }
 
-    struct cl_ztradd_args_s clargs = {
-        .uplo  = uplo,
-        .trans = trans,
-        .m     = m,
-        .n     = n,
-        .alpha = alpha,
-        .tileA = A->get_blktile( A, Am, An ),
-        .beta  = beta,
-        .tileB = B->get_blktile( B, Bm, Bn ),
-    };
+    struct cl_ztradd_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid, accessB;
+    int                      exec = 0;
     char                    *cl_name = "ztradd";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(B, Bm, Bn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_ztradd_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->uplo  = uplo;
+        clargs->trans = trans;
+        clargs->m     = m;
+        clargs->n     = n;
+        clargs->alpha = alpha;
+        clargs->tileA = A->get_blktile( A, Am, An );
+        clargs->beta  = beta;
+        clargs->tileB = B->get_blktile( B, Bm, Bn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_ztradd_callback : NULL;
 
@@ -101,7 +111,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_ztradd,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_ztradd_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_ztradd_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         accessB,       RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
 
diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c
index ac31d898f7e253f3fde11b9f4df37875480eaa5b..20f86bd38af1f4842aa6e853392f00ed7adfbea3 100644
--- a/runtime/starpu/codelets/codelet_ztrmm.c
+++ b/runtime/starpu/codelets/codelet_ztrmm.c
@@ -42,37 +42,34 @@ struct cl_ztrmm_args_s {
 static void
 cl_ztrmm_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztrmm_args_s clargs;
+    struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_ztrmm( clargs.side, clargs.uplo, clargs.transA, clargs.diag,
-                 clargs.m, clargs.n, clargs.alpha, tileA, tileB );
+    TCORE_ztrmm( clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+                 clargs->m, clargs->n, clargs->alpha, tileA, tileB );
 }
 
 #ifdef CHAMELEON_USE_CUDA
 static void
 cl_ztrmm_cuda_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztrmm_args_s clargs;
+    struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-
     RUNTIME_getStream(stream);
 
     CUDA_ztrmm(
-        clargs.side, clargs.uplo, clargs.transA, clargs.diag,
-        clargs.m, clargs.n,
-        (cuDoubleComplex*)&(clargs.alpha),
+        clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+        clargs->m, clargs->n,
+        (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         tileB->mat, tileB->ld,
         stream );
@@ -97,29 +94,40 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn )
 {
-    struct cl_ztrmm_args_s clargs = {
-        .side   = side,
-        .uplo   = uplo,
-        .transA = transA,
-        .diag   = diag,
-        .m      = m,
-        .n      = n,
-        .alpha  = alpha,
-        .tileA  = A->get_blktile( A, Am, An ),
-        .tileB  = B->get_blktile( B, Bm, Bn ),
-    };
+    struct cl_ztrmm_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "ztrmm";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(B, Bm, Bn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_ztrmm_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->side   = side;
+        clargs->uplo   = uplo;
+        clargs->transA = transA;
+        clargs->diag   = diag;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->alpha  = alpha;
+        clargs->tileA  = A->get_blktile( A, Am, An );
+        clargs->tileB  = B->get_blktile( B, Bm, Bn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_ztrmm_callback : NULL;
 
@@ -130,7 +138,7 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_ztrmm,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_ztrmm_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrmm_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         STARPU_RW,     RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
 
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 00b45f3ebc2ce576eaaa387a055ea01099299851..e40dde763ed018a29003e39bc1b5ee377fbeb4df 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -19,6 +19,7 @@
  * @author Cedric Castagnede
  * @author Lucas Barros de Assis
  * @author Florent Pruvost
+ * @author Gwenole Lucas
  * @date 2021-03-16
  * @precisions normal z -> c d s
  *
@@ -42,37 +43,34 @@ struct cl_ztrsm_args_s {
 static void
 cl_ztrsm_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztrsm_args_s clargs;
+    struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_ztrsm( clargs.side, clargs.uplo, clargs.transA, clargs.diag,
-                 clargs.m, clargs.n, clargs.alpha, tileA, tileB );
+    TCORE_ztrsm( clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+                 clargs->m, clargs->n, clargs->alpha, tileA, tileB );
 }
 
 #ifdef CHAMELEON_USE_CUDA
 static void
 cl_ztrsm_cuda_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztrsm_args_s clargs;
+    struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg;
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-
     RUNTIME_getStream(stream);
 
     CUDA_ztrsm(
-        clargs.side, clargs.uplo, clargs.transA, clargs.diag,
-        clargs.m, clargs.n,
-        (cuDoubleComplex*)&(clargs.alpha),
+        clargs->side, clargs->uplo, clargs->transA, clargs->diag,
+        clargs->m, clargs->n,
+        (cuDoubleComplex*)&(clargs->alpha),
         tileA->mat, tileA->ld,
         tileB->mat, tileB->ld,
         stream );
@@ -97,29 +95,40 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
                         CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An,
                         const CHAM_desc_t *B, int Bm, int Bn )
 {
-    struct cl_ztrsm_args_s clargs = {
-        .side   = side,
-        .uplo   = uplo,
-        .transA = transA,
-        .diag   = diag,
-        .m      = m,
-        .n      = n,
-        .alpha  = alpha,
-        .tileA  = A->get_blktile( A, Am, An ),
-        .tileB  = B->get_blktile( B, Bm, Bn ),
-    };
+    struct cl_ztrsm_args_s  *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "ztrsm";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_R(A, Am, An);
     CHAMELEON_ACCESS_RW(B, Bm, Bn);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_ztrsm_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->side   = side;
+        clargs->uplo   = uplo;
+        clargs->transA = transA;
+        clargs->diag   = diag;
+        clargs->m      = m;
+        clargs->n      = n;
+        clargs->alpha  = alpha;
+        clargs->tileA  = A->get_blktile( A, Am, An );
+        clargs->tileB  = B->get_blktile( B, Bm, Bn );
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_ztrsm_callback : NULL;
 
@@ -130,7 +139,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_ztrsm,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_ztrsm_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrsm_args_s),
         STARPU_R,      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
         STARPU_RW,     RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn),
 
diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c
index 157fa8c63a6a1d97af856659d42065a6792b7f29..9732e84c36f4c88c310ee35a8028fc66d4c7b572 100644
--- a/runtime/starpu/codelets/codelet_ztrtri.c
+++ b/runtime/starpu/codelets/codelet_ztrtri.c
@@ -41,17 +41,16 @@ struct cl_ztrtri_args_s {
 static void
 cl_ztrtri_cpu_func(void *descr[], void *cl_arg)
 {
-    struct cl_ztrtri_args_s clargs;
+    struct cl_ztrtri_args_s *clargs = (struct cl_ztrtri_args_s *)cl_arg;
     CHAM_tile_t *tileA;
     int info = 0;
 
     tileA = cti_interface_get(descr[0]);
 
-    starpu_codelet_unpack_args( cl_arg, &clargs );
-    TCORE_ztrtri( clargs.uplo, clargs.diag, clargs.n, tileA, &info );
+    TCORE_ztrtri( clargs->uplo, clargs->diag, clargs->n, tileA, &info );
 
-    if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
-        RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info );
+    if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) {
+        RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info );
     }
 }
 #endif /* !defined(CHAMELEON_SIMULATION) */
@@ -66,26 +65,37 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options,
                          const CHAM_desc_t *A, int Am, int An,
                          int iinfo )
 {
-    struct cl_ztrtri_args_s clargs = {
-        .uplo     = uplo,
-        .diag     = diag,
-        .n        = n,
-        .tileA    = A->get_blktile( A, Am, An ),
-        .iinfo    = iinfo,
-        .sequence = options->sequence,
-        .request  = options->request,
-    };
+    struct cl_ztrtri_args_s *clargs = NULL;
     void (*callback)(void*);
     RUNTIME_request_t       *request  = options->request;
     starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt);
     int                      workerid;
+    int                      exec = 0;
     char                    *cl_name = "ztrtri";
 
     /* Handle cache */
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
     CHAMELEON_ACCESS_RW(A, Am, An);
+    exec = __chameleon_need_exec;
     CHAMELEON_END_ACCESS_DECLARATION;
 
+    /* Build the codelet arguments only when __chameleon_need_exec was set;
+     * the buffer is handed to the task through STARPU_CL_ARGS below. */
+    if ( exec ) {
+        clargs = malloc( sizeof( struct cl_ztrtri_args_s ) );
+        if ( clargs == NULL ) {
+            /* Out of memory: stop here rather than dereference NULL below */
+            abort();
+        }
+        clargs->uplo     = uplo;
+        clargs->diag     = diag;
+        clargs->n        = n;
+        clargs->tileA    = A->get_blktile( A, Am, An );
+        clargs->iinfo    = iinfo;
+        clargs->sequence = options->sequence;
+        clargs->request  = options->request;
+    }
+
-    /* Callback fro profiling information */
+    /* Callback for profiling information */
     callback = options->profiling ? cl_ztrtri_callback : NULL;
 
@@ -96,7 +106,7 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options,
     rt_starpu_insert_task(
         &cl_ztrtri,
         /* Task codelet arguments */
-        STARPU_VALUE, &clargs, sizeof(struct cl_ztrtri_args_s),
+        STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrtri_args_s),
         STARPU_RW,     RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
 
         /* Common task arguments */