diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index cb381cfc153f21234fbc7b3fbea21f5663669627..40c4c24ab4cbd1352d516a6ebae7a8447b3e5f67 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -28,6 +28,7 @@ CHAMELEON_CL_CB(dlag2z,        cti_handle_get_m(task->handles[1]), cti_handle_ge
 CHAMELEON_CL_CB(dzasum,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zaxpy,         cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[1]), 0,                                      M)
 CHAMELEON_CL_CB(zgeadd,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
+CHAMELEON_CL_CB(ztradd,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                  0.5*M*N)
 CHAMELEON_CL_CB(zlascal,       cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
 CHAMELEON_CL_CB(zgelqt,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      (4./3.)*M*N*K)
 CHAMELEON_CL_CB(zgemv,         cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      2. *M*N  )
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index f44ae20438faee407fb7b061c03574ce28694985..9632aef93dd00fe2931832145c15820eda4d2b44 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -21,6 +21,11 @@
 #include <stdlib.h>
 #include "chameleon_starpu.h"
 
+/**
+ * @brief Store the status of some flags to know when to enable/disable them
+ */
+static int context_starpu_flags = 0;
+
 #if (STARPU_MAJOR_VERSION > 1) || ((STARPU_MAJOR_VERSION == 1) && (STARPU_MINOR_VERSION >= 3))
 /* Defined by StarPU as external function */
 #else
@@ -76,7 +81,12 @@ void RUNTIME_enable( void *runtime_ctxt, int lever )
     case CHAMELEON_DAG:
         fprintf(stderr, "StarPU is providing DAG generation through tracing support (CHAMELEON_PROFILING_MODE)\n");
         break;
+    case CHAMELEON_KERNELPROFILE_MODE:
+        context_starpu_flags |= (1 << CHAMELEON_KERNELPROFILE_MODE);
+        starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
+        break;
     case CHAMELEON_PROFILING_MODE:
+        context_starpu_flags |= (1 << CHAMELEON_PROFILING_MODE);
         starpu_profiling_status_set(STARPU_PROFILING_ENABLE);
         break;
     case CHAMELEON_BOUND:
@@ -101,7 +111,16 @@ void RUNTIME_disable( void *runtime_ctxt, int lever )
         fprintf(stderr, "StarPU is providing DAG generation through tracing support (CHAMELEON_PROFILING_MODE)\n");
         break;
     case CHAMELEON_PROFILING_MODE:
-        starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+        context_starpu_flags &= ~(1 << CHAMELEON_PROFILING_MODE); /* clear only this mode's bit */
+        if ( !context_starpu_flags ) { /* no remaining mode relies on StarPU profiling */
+            starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+        }
+        break;
+    case CHAMELEON_KERNELPROFILE_MODE:
+        context_starpu_flags &= ~(1 << CHAMELEON_KERNELPROFILE_MODE); /* clear only this mode's bit */
+        if ( !context_starpu_flags ) { /* no remaining mode relies on StarPU profiling */
+            starpu_profiling_status_set(STARPU_PROFILING_DISABLE);
+        }
         break;
     case CHAMELEON_BOUND:
         starpu_bound_stop();
diff --git a/runtime/starpu/include/runtime_codelet_profile.h b/runtime/starpu/include/runtime_codelet_profile.h
index 20d954fdd2cda5e2071f0c945aaf9de297051a7f..8ff80dab03cfc5a3a5a20037345efcb3712c75d6 100644
--- a/runtime/starpu/include/runtime_codelet_profile.h
+++ b/runtime/starpu/include/runtime_codelet_profile.h
@@ -22,6 +22,7 @@
 #define _runtime_codelet_profile_h_
 
 #include <math.h>
+#include <assert.h>
 
 #define CHAMELEON_CL_CB(name, _m, _n, _k, _nflops)			\
     static measure_t name##_perf[STARPU_NMAXWORKERS];                                          \
@@ -34,6 +35,7 @@
         __attribute__ ((unused)) double K = (double)(_k);                                      \
         double flops = (_nflops);                                                              \
         struct starpu_profiling_task_info *info = task->profiling_info;                        \
+        assert( info != NULL );                                                                \
         double duration = starpu_timing_timespec_delay_us(&info->start_time, &info->end_time); \
         double speed = flops/(1000.0*duration);                                                \
         name##_perf[info->workerid].sum  += speed;                                             \