diff --git a/CMakeLists.txt b/CMakeLists.txt
index a91eb338de119d11a1251be5894ba8a742a688c0..9fb0b898f46e4470bb842336ad1572548ffe351d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,5 +1,8 @@
 cmake_minimum_required (VERSION 3.7)
 project (Tikki)
+
+list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_LIST_DIR}/cmake)
+
 include_directories ("includes")
 include_directories("poti/include" "poti/src")
 
diff --git a/cmake/FindPAPI.cmake b/cmake/FindPAPI.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..86cbf22ae1f1c80cb4ba6db874a35cb5eb7bb2bc
--- /dev/null
+++ b/cmake/FindPAPI.cmake
@@ -0,0 +1,45 @@
+# Try to find PAPI headers and libraries.
+#
+# Usage of this module as follows:
+#
+#     find_package(PAPI)
+#
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+#  PAPI_PREFIX         Set this variable to the root installation of
+#                      libpapi if the module has problems finding the
+#                      proper installation path.
+#
+# Variables defined by this module:
+#
+#  PAPI_FOUND              System has PAPI libraries and headers
+#  PAPI_LIBRARIES          The PAPI library
+#  PAPI_INCLUDE_DIRS       The location of PAPI headers
+
+find_path(PAPI_PREFIX
+    NAMES include/papi.h
+)
+
+find_library(PAPI_LIBRARIES
+    # Names are tried in order: shared library, then static archive, then the bare name.
+    NAMES libpapi.so libpapi.a papi
+    HINTS ${PAPI_PREFIX}/lib ${HILTIDEPS}/lib
+)
+
+find_path(PAPI_INCLUDE_DIRS
+    NAMES papi.h
+    HINTS ${PAPI_PREFIX}/include ${HILTIDEPS}/include
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(PAPI DEFAULT_MSG
+    PAPI_LIBRARIES
+    PAPI_INCLUDE_DIRS
+)
+
+mark_as_advanced(
+    PAPI_PREFIX          # the cache entry created by find_path(PAPI_PREFIX ...) above
+    PAPI_LIBRARIES
+    PAPI_INCLUDE_DIRS
+)
diff --git a/config/git_hash.sh b/config/git_hash.sh
index ea5d3ebbc482fc5b68cc959875ca8d519d19f2c0..7a1d43e2b21484effe46594b1f426a4471e7062c 100755
--- a/config/git_hash.sh
+++ b/config/git_hash.sh
@@ -1,2 +1,2 @@
 #!/bin/bash
-echo "#define GIT_HASH" \"`git describe --always --dirty=+ --tags --long --abbrev=16`\" > $1
+cd "$1" && echo "#define GIT_HASH" \"`git describe --always --dirty=+ --tags --long --abbrev=16`\" > "$2"
diff --git a/includes/kaapi_trace.h b/includes/kaapi_trace.h
index 3f653e724cd5622924473e79de7cc6c6332b54c3..53051f2ee6b547fcd8cef16f8a68ee9fc55c3942 100644
--- a/includes/kaapi_trace.h
+++ b/includes/kaapi_trace.h
@@ -47,16 +47,11 @@
 
 #include <stdint.h>
 #include <stddef.h>
-#include "hw_count.h" 
 
 #if defined(__cplusplus)
 extern "C" {
 #endif
 
-#if !defined(KAAPI_MAX_HWCOUNTERS)
-#define KAAPI_MAX_HWCOUNTERS 4
-#endif
-
 #if !defined(KAAPI_CACHE_LINE)
 #define KAAPI_CACHE_LINE 64
 #endif
@@ -161,10 +156,6 @@ extern double kaapi_get_elapsedtime(void);
 #define KAAPI_PERF_GROUP_DFGBUILD  4
 #define KAAPI_PERF_GROUP_OFFLOAD   5
 
-#if ((KAAPI_PERF_ID_ENDSOFTWARE+KAAPI_MAX_HWCOUNTERS) > KAAPI_PERF_ID_MAX)
-#error "The maximal size of the peformance counters handled by Kaapi should be extended. Please contact the authors."
-#endif
-
 
 /* counter type
 */
@@ -241,8 +232,8 @@ typedef struct kaapi_named_perfctr {
 #define KAAPI_EVT_TASK_BEG           2     /* begin execution of tasks */
 #define KAAPI_EVT_TASK_END           3     /* end execution of tasks, d0: task, d1: numaid */
 #define KAAPI_EVT_TASK_SUCC          4     /* T0 has successor T1 */
-#define KAAPI_EVT_TASK_ACCESS        5     /* d0: task, d1: mode, d2: pointer */
-#define KAAPI_EVT_COMP_DAG           6     /* computing the dag, i0[0]=1 iff beg, =0 iff else. d1: key */
+#define KAAPI_EVT_TASK_ACCESS        5     /* d0: task, d1: mode, d2: pointer, d3: numaid */
+#define KAAPI_EVT_TASK_DATA          6     /* d0: task, d1: data, d2: size, d3.i32[0]: mode d3.i32[1]: numaid */
 /*#define KAAPI_EVT_FREE0            7*/
 #define KAAPI_EVT_UNDEFINED_0        8
 #define KAAPI_EVT_UNDEFINED_1        9
@@ -258,19 +249,19 @@ typedef struct kaapi_named_perfctr {
 #define KAAPI_EVT_STEAL_OP           19    /* when k-processor emit a steal request data=victimid, serial*/
 #define KAAPI_EVT_STEAL_AGGR_BEG     20    /* when begin to be a combiner */
 #define KAAPI_EVT_STEAL_AGGR_END     21    /* when begin to be a combiner */
-#define KAAPI_EVT_OFFLOAD_HTOH_BEG   22 /* offload copy */
+#define KAAPI_EVT_OFFLOAD_HTOH_BEG   22    /* offload copy */
 #define KAAPI_EVT_OFFLOAD_HTOH_END   23
-#define KAAPI_EVT_OFFLOAD_HTOD_BEG   24 /* offload copy */
+#define KAAPI_EVT_OFFLOAD_HTOD_BEG   24    /* offload copy */
 #define KAAPI_EVT_OFFLOAD_HTOD_END   25
-#define KAAPI_EVT_OFFLOAD_DTOH_BEG   26 /* offload copy */
+#define KAAPI_EVT_OFFLOAD_DTOH_BEG   26    /* offload copy */
 #define KAAPI_EVT_OFFLOAD_DTOH_END   27
-#define KAAPI_EVT_OFFLOAD_DTOD_BEG   28 /* offload copy */
+#define KAAPI_EVT_OFFLOAD_DTOD_BEG   28    /* offload copy */
 #define KAAPI_EVT_OFFLOAD_DTOD_END   29
 #define KAAPI_EVT_OFFLOAD_KERNEL_BEG 30
 #define KAAPI_EVT_OFFLOAD_KERNEL_END 31
-#define KAAPI_EVT_PARALLEL           32 /* i0[] = 1 iff beg, = 0 iff end, d1: parallel_id */
+#define KAAPI_EVT_PARALLEL           32    /* i0[] = 1 iff beg, = 0 iff end, d1: parallel_id */
 /*#define KAAPI_EVT_FREE0            33*/
-#define KAAPI_EVT_TASKWAIT           34 /* i0[] = 1 iff beg, = 0 iff end, d1: task_id */
+#define KAAPI_EVT_TASKWAIT           34    /* i0[] = 1 iff beg, = 0 iff end, d1: task_id */
 /*#define KAAPI_EVT_FREE0            35*/
 #define KAAPI_EVT_TASKGROUP          36
 /*#define KAAPI_EVT_FREE0            37*/
@@ -319,9 +310,9 @@ typedef uint64_t kaapi_event_mask_type_t;
     (  KAAPI_EVT_MASK(KAAPI_EVT_TASK_BEG) \
      | KAAPI_EVT_MASK(KAAPI_EVT_TASK_SUCC) \
      | KAAPI_EVT_MASK(KAAPI_EVT_TASK_ACCESS) \
+     | KAAPI_EVT_MASK(KAAPI_EVT_TASK_DATA) \
      | KAAPI_EVT_MASK(KAAPI_EVT_TASK_END) \
      | KAAPI_EVT_MASK(KAAPI_EVT_TASK_STEAL) \
-     | KAAPI_EVT_MASK(KAAPI_EVT_COMP_DAG) \
      | KAAPI_EVT_MASK(KAAPI_EVT_OFFLOAD_KERNEL_BEG) \
      | KAAPI_EVT_MASK(KAAPI_EVT_OFFLOAD_KERNEL_END) \
     )
@@ -337,7 +328,6 @@ typedef uint64_t kaapi_event_mask_type_t;
      | KAAPI_EVT_MASK(KAAPI_EVT_LOOP_NEXT) \
      | KAAPI_EVT_MASK(KAAPI_EVT_LOOP_MDATA) \
      | KAAPI_EVT_MASK(KAAPI_EVT_TASK_ATTR) \
-     | KAAPI_EVT_MASK(KAAPI_EVT_COMP_DAG) \
     )
 
 #define KAAPI_EVT_MASK_SCHED \
@@ -547,13 +537,12 @@ typedef enum kaapi_access_mode_t {
   KAAPI_ACCESS_MODE_S   = 16,       /* 0001 0000 : stack data */
   KAAPI_ACCESS_MODE_T   = 32,       /* 0010 0000 : for Quark support: scratch mode or temporary */
   KAAPI_ACCESS_MODE_P   = 64,       /* 0100 0000 : */
-  KAAPI_ACCESS_MODE_IP  = 128,      /* 1000 0000 : in place, for CW only */
+  KAAPI_ACCESS_MODE_C   = 128,      /* 1000 0000 : commute only - for CW only - */
 
   KAAPI_ACCESS_MODE_RW  = KAAPI_ACCESS_MODE_R|KAAPI_ACCESS_MODE_W,
   KAAPI_ACCESS_MODE_STACK = KAAPI_ACCESS_MODE_S|KAAPI_ACCESS_MODE_RW,
   KAAPI_ACCESS_MODE_SCRATCH = KAAPI_ACCESS_MODE_T|KAAPI_ACCESS_MODE_V,
   KAAPI_ACCESS_MODE_CWP = KAAPI_ACCESS_MODE_P|KAAPI_ACCESS_MODE_CW,
-  KAAPI_ACCESS_MODE_ICW = KAAPI_ACCESS_MODE_IP|KAAPI_ACCESS_MODE_CW
 } kaapi_access_mode_t;
 
 #define KAAPI_ACCESS_MASK_RIGHT_MODE   0x7F   /* 5 bits, ie bit 0, 1, 2, 3, 4, including P mode */
@@ -890,6 +879,19 @@ extern void kaapi_tracelib_task_access(
 );
 
 
+/* Emit one KAAPI_EVT_TASK_DATA event per entry of the parallel arrays data/size/mode ('count' entries each).
+   Each mode[i] is first translated by ompt_mode_decoder; nothing is recorded when that event is masked out. */
+extern void kaapi_tracelib_task_data(
+    kaapi_tracelib_thread_t*     kproc,
+    kaapi_task_id_t              task,
+    int                          count,
+    void**                       data,
+    size_t*                      size,
+    int*                         mode,
+    void                        (*ompt_mode_decoder)(int,int*)
+);
+
+
 /* Thread's synchronization related functions
 */
 extern void kaapi_tracelib_barrier_begin(
diff --git a/scripts/create_gantt.R b/scripts/create_gantt.R
new file mode 100644
index 0000000000000000000000000000000000000000..4ffa9f010673cbb646c4a52afb7e95d0fb0f52b4
--- /dev/null
+++ b/scripts/create_gantt.R
@@ -0,0 +1,47 @@
+library(dplyr);
+library(ggplot2);
+
+readtrace <- function (filename)  # Load a trace CSV and return the rows whose Explicit column equals 1.
+{
+   df <- read.csv(filename, header=TRUE, sep=",", strip.white=TRUE);
+   df <- df %>% filter((Explicit==1)) %>% as.data.frame();  # drop every row where Explicit != 1
+   df$Start <- df$Start*1e-9; # Convert ns to seconds (same scaling for End and Duration below)
+   df$End <- df$End*1e-9;
+   df$Duration <- df$Duration*1e-9;
+   df;  # the value of the last expression is the function's result
+}
+
+args <- commandArgs(trailingOnly=TRUE)
+# Fail early with a usage message instead of passing NA to read.csv().
+if (length(args) < 1) stop("usage: Rscript create_gantt.R <trace.csv>");
+df <- readtrace(args[1]);
+
+
+# helper: convert s to the date
+date<-function(d) { as.POSIXct(d, origin="1970-01-01"); }
+
+# theplot
+theplot = ggplot() +
+  theme_bw(base_size=16) +
+   xlab("Time [s]") +
+   ylab("Thread Identification") +
+   scale_fill_brewer(palette = "Set1") +
+   theme (
+       legend.spacing = unit(.1, "line"),
+       panel.grid.major = element_blank(),
+       panel.spacing=unit(0, "cm"),
+       panel.grid=element_line(size=0),
+       legend.position = "bottom",
+       legend.title =  element_text("Helvetica")
+   ) +
+   guides(fill = guide_legend(nrow = 1)) +
+   geom_rect(data=df, alpha=1, aes(fill=Name,
+                                 xmin=date(Start),
+                                 xmax=date(End),
+                                 ymin=Resource,
+                                 ymax=Resource+0.9)) +
+   scale_y_reverse();
+ pdf("gantt.pdf", width=10, height=6)
+ print(theplot)
+ dev.off()
+
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 84989b7412b1b87affc3520d271ba9edc02213c2..2d5f30616453f2a88e9e7b10acfda7b464b1c2c5 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -4,3 +4,4 @@ set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
 
 add_executable(hello hello.c)
 add_executable(hello-task hello-task.c)
+target_link_libraries(hello-task PRIVATE m)
diff --git a/test/hello-task.c b/test/hello-task.c
index 3ff5a30bece771085ef65ecf992cecc1efd9b5e6..b18738e0fd7f3ec5aaf7f34c9945347201009ced 100644
--- a/test/hello-task.c
+++ b/test/hello-task.c
@@ -1,6 +1,7 @@
 #include <stdio.h>
 #include <unistd.h>
 #include <omp.h>
+#include <math.h>
 
 int array[] = { 1, 2, 3, 4};
 
@@ -18,7 +19,10 @@ int main()
     for (int i = 0; i < 4; i++) {
       #pragma omp task depend(in: array[i]) depend(inout: array[(i+1)%4])
       {
+        double d = 0.0;  /* accumulator for the dummy workload; must be initialized before the += below */
         array[(i+1)%4] = array[i];
+        for (int j=0; j<100000; ++j) 
+          d += sin(M_PI/j)*cos(M_PI/i);
         printf("Hey %i\n", i);
       }
     }
diff --git a/tool/CMakeLists.txt b/tool/CMakeLists.txt
index e42dac0e27c166fcd7abbb7c2a605808e715e913..2db4edd2ea890a884015992cc419ca19c2351bdf 100644
--- a/tool/CMakeLists.txt
+++ b/tool/CMakeLists.txt
@@ -1,4 +1,5 @@
-find_file(OMPT_HEADER NAMES omp-tools.h)
+unset(OMPT_HEADER CACHE)
+find_file(OMPT_HEADER NAMES omp-tools.h HINTS ENV CPLUS_INCLUDE_PATH ENV C_INCLUDE_PATH)
 if (${OMPT_HEADER} STREQUAL "OMPT_HEADER-NOTFOUND")
   message(FATAL_ERROR "The OpenMP's OMPT header (omp-tools.h) was not found. Please check your OpenMP runtime installation.")
 endif()
@@ -8,13 +9,8 @@ set(SOURCES
   ompt_callbacks.cpp
   )
 
-add_custom_command (
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h
-  COMMAND ${SHELL_EXECUTABLE} ${CMAKE_SOURCE_DIR}/config/hw_count.sh ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h
-  )
-
 include_directories( ${CMAKE_CURRENT_BINARY_DIR} )
 
-add_library(tikki SHARED ${SOURCES} ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h)
+add_library(tikki SHARED ${SOURCES} )
 target_link_libraries (tikki tracelib)
 install(TARGETS tikki DESTINATION lib)
diff --git a/tool/ompt_callbacks.cpp b/tool/ompt_callbacks.cpp
index f73787d78bcd66c279d8340a032f9de1edf2bb48..82c7018261ae466fbc1402370694aaefeba956cc 100644
--- a/tool/ompt_callbacks.cpp
+++ b/tool/ompt_callbacks.cpp
@@ -73,19 +73,25 @@
 #define LOG 0
 
 
-#if 0
-static ompt_get_task_id_t ompt_get_task_id;
-static ompt_get_thread_id_t ompt_get_thread_id;
-static ompt_get_parallel_id_t ompt_get_parallel_id;
-#endif
-
 std::atomic<uint64_t> unique_thread_id(1);
+std::atomic<uint64_t> unique_parallel_id(1);
+
+#define MAX_PARAM 16
+typedef struct { 
+  int     count;
+  void*   data[MAX_PARAM];
+  size_t  size[MAX_PARAM];
+  int     mode[MAX_PARAM];
+} task_data_info_t;
+
 
 typedef struct tikki_task_id_s {
   uint64_t id;
   const void *task_ptr;
   ompt_dependence_t *deps;
   int ndeps;
+  const char* name;
+  task_data_info_t datainfo;
 } tikki_task_id_t;
 
 
@@ -176,6 +182,49 @@ return_fast:
   return output;
 }
 
+
+/* Translate an OpenMP dependence type into a KAAPI access mode stored in *mode (VOID when unknown).
+   The parameter is an int so the signature matches the void(*)(int,int*) decoder type it is passed as. */
+static void ompt_mode_decoder( int odt, int* mode )
+{
+  *mode = KAAPI_ACCESS_MODE_VOID;
+  switch ((ompt_dependence_type_t)odt)
+  {
+    case ompt_dependence_type_in:
+    {
+      *mode =KAAPI_ACCESS_MODE_R;
+      break;
+    }
+    case ompt_dependence_type_out:
+    {
+      *mode =KAAPI_ACCESS_MODE_W;
+      break;
+    }
+    case ompt_dependence_type_inout:
+    {
+      *mode =KAAPI_ACCESS_MODE_R|KAAPI_ACCESS_MODE_W;
+      break;
+    }
+    case ompt_dependence_type_mutexinoutset:
+    {
+      *mode =KAAPI_ACCESS_MODE_C|KAAPI_ACCESS_MODE_CW;
+      break;
+    }
+    default:
+    {
+      static int alreadydisplay = 0;  /* warn only once per process */
+      if (alreadydisplay ==0)
+      {
+        printf("*** OpenMP dependence type: %i not implemented\n", odt );
+        alreadydisplay = 1;
+      }
+    }
+  }
+}
+
+
+/*
+*/
 static void ompt_decoder( ompt_dependence_t* dep, int i, void** addr, size_t* len, int* mode /*, size_t* len */ )
 {
   *addr = dep[i].variable.ptr;
@@ -190,6 +239,7 @@ static void ompt_decoder( ompt_dependence_t* dep, int i, void** addr, size_t* le
     *mode =KAAPI_ACCESS_MODE_R|KAAPI_ACCESS_MODE_W;
   //if (dep[i].dependence_flags.commute)
     //*mode |=KAAPI_ACCESS_MODE_CW;
+  //ompt_mode_decoder(dep[i].dependence_type, mode );
 }
 
 
@@ -288,7 +338,8 @@ void ompt_callback_parallel_begin_action (
 )
 {
   ompt_data_t *thread_data = ompt_get_thread_data();
-  parallel_data->value = ompt_get_unique_id();
+  //parallel_data->value = ompt_get_unique_id();
+  parallel_data->value = unique_parallel_id++;
   uint64_t thread_id = thread_data->value;
   tikki_task_id_t *task = (tikki_task_id_t *)encountering_task_data->ptr;
 
@@ -364,6 +415,33 @@ ompt_callback_parallel_end_action(
 }
 
 
+/* Thread-local name for the next task created on this thread; ompt_callback_task_create_action
+   copies the pointer into the task and clears it. Only the pointer is stored, not the characters. */
+__thread const char* next_name = 0;
+extern "C" 
+void tikki_ompt_set_task_name(const char* name )
+{
+  next_name = name;
+}
+
+
+/* Thread-local data description (parallel arrays data/size/mode) for the next task created on this thread.
+   At most MAX_PARAM entries fit: a larger count is truncated and a negative count is treated as 0. */
+__thread task_data_info_t next_data_info = { 0 };
+  
+extern "C" 
+void tikki_ompt_set_task_data(int count, void** data, size_t* size, int* mode ) 
+{
+#if LOG
+printf("In %s: count: %i, data[0]:%p, size[0]: %li, mode[0]: %i\n", __func__, 
+    count, data[0], size[0], mode[0] );
+#endif
+  next_data_info.count = (count < 0) ? 0 : ((count > MAX_PARAM) ? MAX_PARAM : count);
+  memcpy( &next_data_info.data, data, sizeof(void*)*next_data_info.count );
+  memcpy( &next_data_info.size, size, sizeof(size_t)*next_data_info.count );
+  memcpy( &next_data_info.mode, mode, sizeof(int)*next_data_info.count );
+}
+
 void
 ompt_callback_task_create_action(
     ompt_data_t *parent_task_data,    /* id of parent task            */
@@ -379,11 +457,14 @@ ompt_callback_task_create_action(
   task->id = ompt_get_unique_id();
   task->task_ptr = codeptr_ra;
   task->ndeps = 0;
+  task->name = next_name;
+  task->datainfo = next_data_info;
+  next_name = 0; next_data_info.count = 0;  /* both hints are one-shot: later tasks must not inherit them */
 #if LOG
   if (parent_task_data) {
     tikki_task_id_t *parent_task = (tikki_task_id_t*)parent_task_data->ptr;
     printf("%" PRIu64 ": ompt_task_create: parent_id=%" PRIu64 ", task_id=%" PRIu64\
-           ", type=%i, has_dep=%i, ptr=%" PRIu64 "\n", thread_id,
+           ", type=%i, has_dep=%i, ptr=%p\n", thread_id,
            parent_task->id, task->id, type,
            has_dependences, task->task_ptr);
   } else {
@@ -432,24 +513,32 @@ ompt_callback_task_schedule_action(
     kaapi_tracelib_thread_switchstate(koti->kproc);
     //free(prior_task);
   }
-  if (next_task_data->ptr) {
+  if (next_task_data->ptr) 
+{
     tikki_task_id_t *new_task = (tikki_task_id_t *)next_task_data->ptr;
     // We are starting a task
 #if LOG
     printf("%" PRIu64 ": ompt_event_task_begin: task_id=%" PRIu64 ", status=%i, deps: %p\n", thread_id, new_task->id, prior_task_status, new_task->deps);
 #endif
-    /* This is code for implicit task begin.
+    /* This is code for explicit task begin.
     */
     kaapi_ompt_thread_info_t* koti = &__kaapi_oth_info[thread_id];
     kaapi_tracelib_thread_switchstate(koti->kproc);
     char buff[30];
-    sprintf(buff, "<undef-%p-%i>\0", new_task->task_ptr, new_task->id);
+    /* Use the name supplied for this task when there is one; otherwise synthesize one from the code
+       pointer and the 64-bit task id. snprintf keeps the write inside buff (truncating if needed). */
+    const char* taskname = new_task->name;
+    if (taskname == 0)
+    {
+      snprintf(buff, sizeof(buff), "<undef-%p-%" PRIu64 ">", new_task->task_ptr, new_task->id);
+      taskname = buff;
+    }
     kaapi_descrformat_t* fdescr = kaapi_tracelib_register_fmtdescr(
         0,
         // TODO: get codeptr_ra there
         (void *)new_task->task_ptr,
         0, //loc
-        buff,
+        taskname,
         libomp_filter_func
         );
     int idxtop = koti->pstack.top;
@@ -478,6 +567,19 @@ ompt_callback_task_schedule_action(
           (void (*)(void*, int, void**, size_t*, int*))ompt_decoder
       );
     }
+    if (new_task->datainfo.count)
+    {
+      kaapi_tracelib_task_data(
+          koti->kproc,
+          (kaapi_task_id_t)new_task->id,
+          new_task->datainfo.count,
+          new_task->datainfo.data,
+          new_task->datainfo.size,
+          new_task->datainfo.mode,
+          (void (*)(int, int*))ompt_mode_decoder
+      );
+      new_task->datainfo.count = 0;
+    }
   }
 }
 
@@ -501,13 +603,15 @@ ompt_callback_implicit_task_action (
 #if LOG
     printf("%" PRIu64 ": ompt_event_implicit_task_action: begin. parallel_id=%" PRIu64 ", task_id=%" PRIu64 "\n", thread_id, parallel_data->value, task->id);
 #endif
+    char buff[30];
+    sprintf(buff, "<implicit>") ;
     kaapi_ompt_thread_info_t* koti = &__kaapi_oth_info[thread_id];
     kaapi_descrformat_t* fdescr = kaapi_tracelib_register_fmtdescr(
         1,
         //team->key, [> same key as the team. Not implicit task ? why ? [TG] <]
         (void *)task->id,
         //team->name,
-        "<implicit>",
+        buff,
         0,
         0          /* no filter: team name should be already well formed */
         );
@@ -580,19 +684,46 @@ ompt_callback_implicit_task_action (
 #endif
 }
 
+
+
 void
 ompt_callback_dependences_action (
     ompt_data_t *task_data,
     const ompt_dependence_t *deps,
     int ndeps
-    )
+)
 {
   tikki_task_id_t *task = (tikki_task_id_t *)task_data->ptr;
   uint64_t thread_id = ompt_get_thread_data()->value;
   task->deps = (ompt_dependence_t *)malloc(ndeps * sizeof(ompt_dependence_t));
 #if LOG
-  printf("%" PRIu64 ": ompt_event_task_dependences: tsak_id=%" PRIu64 ", #deps=%i, deps: %p\n", thread_id, task->id, ndeps, task->deps);
+  printf("%" PRIu64 ": ompt_event_dependences: tsak_id=%" PRIu64 ", #deps=%i, deps: %p\n", thread_id, task->id, ndeps, task->deps);
 #endif
   task->ndeps = ndeps;
   memcpy(task->deps, deps, ndeps*sizeof(ompt_dependence_t));
 }
+
+/* OMPT task-dependence callback: forwards the ids of the source and sink tasks to
+   kaapi_tracelib_task_depend, using the kproc of the thread on which the callback runs. */
+void
+ompt_callback_task_dependence_action (
+  ompt_data_t *src_task_data,
+  ompt_data_t *sink_task_data
+)
+{
+  tikki_task_id_t *src_task = (tikki_task_id_t *)src_task_data->ptr;
+  tikki_task_id_t *dest_task = (tikki_task_id_t *)sink_task_data->ptr;
+  uint64_t thread_id = ompt_get_thread_data()->value;
+  kaapi_ompt_thread_info_t* koti = &__kaapi_oth_info[thread_id];
+
+  kaapi_tracelib_task_depend(
+    koti->kproc,
+    (kaapi_task_id_t)src_task->id,
+    (kaapi_task_id_t)dest_task->id
+  );
+
+#if LOG
+  printf("%" PRIu64 ": ompt_event_task_dependence: tsak_id=%" PRIu64 ", sink: %" PRIu64 "\n", thread_id, src_task->id, dest_task->id);
+#endif
+}
+
diff --git a/tool/ompt_callbacks.def b/tool/ompt_callbacks.def
index 99a78d7a494d157f3b74a858a8d438dda3255457..e4b2b7113131204f354e3567a50ee060491680eb 100644
--- a/tool/ompt_callbacks.def
+++ b/tool/ompt_callbacks.def
@@ -13,6 +13,7 @@ CALLBACK(ompt_callback_task_create, ompt_data_t*, const ompt_frame_t*, ompt_data
 CALLBACK(ompt_callback_task_schedule, ompt_data_t *, ompt_task_status_t, ompt_data_t *)
 CALLBACK(ompt_callback_implicit_task, ompt_scope_endpoint_t, ompt_data_t *, ompt_data_t*, unsigned int, unsigned int, int)
 CALLBACK(ompt_callback_dependences, ompt_data_t *, const ompt_dependence_t *, int)
+CALLBACK(ompt_callback_task_dependence, ompt_data_t *, ompt_data_t * )
 UNUSED(ompt_callback_target)
 UNUSED(ompt_callback_target_data_op)
 UNUSED(ompt_callback_target_submit)
@@ -23,8 +24,6 @@ UNUSED(ompt_callback_device_load)
 UNUSED(ompt_callback_device_unload)
 UNUSED(ompt_callback_sync_region_wait)
 UNUSED(ompt_callback_mutex_released)
-UNUSED(ompt_callback_dependences)
-UNUSED(ompt_callback_task_dependence)
 UNUSED(ompt_callback_work)
 UNUSED(ompt_callback_master)
 UNUSED(ompt_callback_target_map)
diff --git a/tool/tool.cpp b/tool/tool.cpp
index 6a17401bd6c26f5836f26bc1b46ea10734777201..1f68149ae26dc2080f3985c58e18fc8f711eda21 100644
--- a/tool/tool.cpp
+++ b/tool/tool.cpp
@@ -18,19 +18,47 @@ ompt_get_unique_id_t ompt_get_unique_id;
 #include <atomic>
 std::atomic<uint64_t> myuid(0);
 
-void initTool(ompt_function_lookup_t lookup) {
-  printf("init tool\n");
+
+
+extern "C" {
+
+/* Internal function
+*/
+extern void tikki_ompt_set_task_name(const char* name );
+extern void tikki_ompt_set_task_data(int count, void** data, size_t* size, int* mode );
+
+void __tikki_ompt_set_task_name(const char* name )
+{ tikki_ompt_set_task_name(name); }
+void __tikki_ompt_set_task_data(int count, void** data, size_t* size, int* mode )
+{ tikki_ompt_set_task_data(count, data, size, mode); }
+
+/* Exported function as extension to be called by application
+*/
+extern void ompt_set_task_name(const char* name ) 
+{ 
+  tikki_ompt_set_task_name(name);
+}
+
+//__attribute__ ((weak, alias ("__tikki_ompt_set_task_name")));
+extern void ompt_set_task_data(int count, void** data, size_t* size, int* mode ) 
+{
+  tikki_ompt_set_task_data(count, data, size, mode);
+}
+//__attribute__ ((weak, alias ("__tikki_ompt_set_task_data")));
+} // extern "C"
+
+
+
+void initTool(ompt_function_lookup_t lookup) 
+{
   int err = 0;
   __kaapi_oth_info = (kaapi_ompt_thread_info_t*)calloc(__kaapi_oth_info_capacity, sizeof(kaapi_ompt_thread_info_t));
-  err = kaapi_tracelib_init(
-      getpid()
-      );
-
+  err = kaapi_tracelib_init( getpid() );
   if (err !=0)
-    printf("[OMP-TRACE] kaapi tracing, init error:%i, version: %s\n", err, get_kaapi_version());
+    printf("[OMP-TRACE] TiKKi tracing, init error:%i, version: %s\n", err, get_kaapi_version());
   else
-    printf("[OMP-TRACE] kaapi tracing version: %s\n",get_kaapi_version());
-  printf("oth_info %p\n", __kaapi_oth_info);
+    printf("[OMP-TRACE] TiKKi tracing version: '%s'\n",get_kaapi_version());
+  //printf("oth_info %p\n", __kaapi_oth_info);
   ompt_set_callback_t ompt_set_callback = (ompt_set_callback_t) lookup("ompt_set_callback");
   ompt_get_thread_data = (ompt_get_thread_data_t) lookup("ompt_get_thread_data");
   ompt_get_unique_id = (ompt_get_unique_id_t) lookup("ompt_get_unique_id");
@@ -39,6 +67,9 @@ void initTool(ompt_function_lookup_t lookup) {
     return ++myuid;
   };
 
+  //ompt_set_task_name = tikki_ompt_set_task_name;
+  //ompt_set_task_data = tikki_ompt_set_task_data;
+
 #define CALLBACK(name, ...)                       \
   do{                                                           \
     if (ompt_set_callback(name, (ompt_callback_t)name##_action) ==   \
@@ -63,14 +94,14 @@ extern "C" {
   {
     kaapi_tracelib_fini();
     free(__kaapi_oth_info);
-    fprintf(stderr, "Exiting Tikki tool\n");
+    fprintf(stderr, "[OMP-TRACE] Exiting Tikki tool\n");
   }
 
   ompt_start_tool_result_t* ompt_start_tool(
       unsigned int omp_version,
       const char *runtime_version)
   {
-    fprintf(stderr, "Loading Tikki tool\n");
+    fprintf(stderr, "[OMP-TRACE] Loading TiKKi tool\n");
     static ompt_start_tool_result_t ompt_start_tool_result = {&ompt_initialize,&ompt_finalize,0};
     return &ompt_start_tool_result;
   }
diff --git a/tracelib/CMakeLists.txt b/tracelib/CMakeLists.txt
index e91301da00c3cafe27a5a81739062e4536d9b235..cb32ab232f5755f63215c33edf7521a6ebe25cca 100644
--- a/tracelib/CMakeLists.txt
+++ b/tracelib/CMakeLists.txt
@@ -1,6 +1,6 @@
 add_custom_command (
   OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/git_hash.h
-  COMMAND ${SHELL_EXECUTABLE} ${CMAKE_SOURCE_DIR}/config/git_hash.sh ${CMAKE_CURRENT_BINARY_DIR}/git_hash.h
+  COMMAND ${SHELL_EXECUTABLE} ${CMAKE_SOURCE_DIR}/config/git_hash.sh ${CMAKE_SOURCE_DIR} ${CMAKE_CURRENT_BINARY_DIR}/git_hash.h
   )
 
 add_custom_command (
@@ -11,8 +11,21 @@ add_custom_command (
 set(THREADS_PREFER_PTHREAD_FLAG ON)
 find_package(Threads REQUIRED)
 
-include_directories( ${CMAKE_CURRENT_BINARY_DIR} )
+find_package(PAPI)  # optional: resolved by cmake/FindPAPI.cmake; KAAPI_USE_PAPI reflects the outcome
+if (PAPI_FOUND)
+  add_compile_options("-DKAAPI_USE_PAPI=1")
+else()
+  add_compile_options("-DKAAPI_USE_PAPI=0")
+endif()
+
+include_directories(${CMAKE_CURRENT_BINARY_DIR})
 
 add_library(tracelib SHARED kaapi_recorder.c kaapi_rt.c kaapi_trace_lib.c kaapi_hashmap.c kaapi_trace_rt.c kaapi_parser.c ${CMAKE_CURRENT_BINARY_DIR}/git_hash.h ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h)
 target_link_libraries(tracelib Threads::Threads)
+
+if (PAPI_FOUND)
+  target_include_directories(tracelib PRIVATE ${PAPI_INCLUDE_DIRS})  # scoped to tracelib only
+  target_link_libraries(tracelib ${PAPI_LIBRARIES})  # keyword-less to match the Threads::Threads call above
+endif()
+
 install(TARGETS tracelib DESTINATION lib)
diff --git a/tracelib/kaapi_recorder.c b/tracelib/kaapi_recorder.c
index dd91fd3dda955394be7bb98ddd2039d272431c75..8a9008de5f6b453ea3f42df66809da07830fda66 100644
--- a/tracelib/kaapi_recorder.c
+++ b/tracelib/kaapi_recorder.c
@@ -203,14 +203,24 @@ static int _kaapi_write_header( int kid )
   for (i=0; i<kaapi_tracelib_param.fmt_listsize; ++i)
   {
     const kaapi_descrformat_t* fmt = kaapi_tracelib_param.fmt_list[i];
+    // TODO ?
+    if (fmt ==0) continue;
+    if (header.taskfmt_count >= KAAPI_FORMAT_MAX){ 
+        fprintf(stderr, "Warning: too many fmtdefs\n");
+        break;
+    }
     kaapi_fmttrace_def* fmtdef = &header.fmtdefs[header.taskfmt_count];
     fmtdef->fmtid = fmt->fmtid;
-    if (fmt->name !=0)
-      strncpy( fmtdef->name, fmt->name, 64);
+    if (fmt->name !=0){
+      strncpy( fmtdef->name, fmt->name, 63);
+      fmtdef->name[63] = 0;
+    }
     else
       strncpy( fmtdef->name, "no name", 64);
-    if (fmt->color !=0)
-      strncpy( fmtdef->color, fmt->color, 32);
+    if (fmt->color !=0){
+      strncpy( fmtdef->color, fmt->color, 31);
+      fmtdef->color[31] = 0;
+    }
     else
       strncpy( fmtdef->color, "0.0 0.0 1.0", 32);
     ++header.taskfmt_count;
diff --git a/tracelib/kaapi_trace_lib.c b/tracelib/kaapi_trace_lib.c
index 2d7f1bbbb66b5f3b68d45a5d62bf372394eba395..5082d5058b3ee0e3c36fb0e130daa7767478547d 100644
--- a/tracelib/kaapi_trace_lib.c
+++ b/tracelib/kaapi_trace_lib.c
@@ -108,6 +108,11 @@ typedef struct timespec struct_time;
 #include "kaapi_util.h"
 #include "kaapi_atomic.h"
 
+#if ((KAAPI_PERF_ID_ENDSOFTWARE+KAAPI_MAX_HWCOUNTERS) > KAAPI_PERF_ID_MAX)
+#error "The maximal size of the peformance counters handled by Kaapi should be extended. Please contact the authors."
+#endif
+
+
 #if defined(__cplusplus)
 extern "C" {
 #endif
@@ -117,6 +122,15 @@ extern "C" {
 */
 
 
+/* ------------------------------------------------------------------------------------------- */
+/* 
+   Initialization
+*/
+/* Incremented on every call to kaapi_tracelib_init; only the first call initializes the library.
+*/
+static int once_init = 0;
+
+
 /* ------------------------------------------------------------------------------------------- */
 /*
   Global Variable
@@ -559,10 +573,7 @@ int kaapi_tracelib_init(
   int gid
 )
 {
-  static int once = 0;
-  if (once) return 0;
-  once = 1;
-
+  if (++once_init >1) return 0;
   int i, error;
 
   /* Update counters: undefined code */
@@ -750,15 +761,15 @@ int kaapi_tracelib_init(
 }
 
 
+
 /* ------------------------------------------------------------------------------------------- */
 /** Finish trace. Assume that threads have reach the barrier and flush
     their event buffers.
 */
+static int once_fini = 0;
 void kaapi_tracelib_fini(void)
 {
-  static int once = 0;
-  if (once) return;
-  once = 1;
+  if (++once_fini != once_init) return;  /* finalize once, on the call that balances the init calls; extra calls are no-ops */
 
   FILE *file = 0;
   char buffer[8192];
@@ -778,6 +789,7 @@ void kaapi_tracelib_fini(void)
 #endif
 
   /* Display stat per task */
+#if 0 // TG: 20-05-05 disable this feature
   if (kaapi_tracelib_param.display_perfcounter != KAAPI_NO_DISPLAY_PERF)
   {
     sprintf(filename, "stat.%i", getpid());
@@ -789,6 +801,7 @@ void kaapi_tracelib_fini(void)
         1e-9*((double)kaapi_get_elapsedns_since_start())
     );
   }
+#endif
 
   if (file !=0)
   {
@@ -1072,7 +1085,7 @@ kaapi_tracelib_thread_t* kaapi_tracelib_thread_init(
     {
       if (kaapi_perf_idset_test( &ctxt->perfset, i))
       {
-printf("Attach event: '%s'\n", kaapi_perfctr_info[i].name);
+//printf("Attach event: '%s'\n", kaapi_perfctr_info[i].name);
         papi_event_codes[count++] = kaapi_perfctr_info[i].eventcode;
       }      
     }
@@ -1671,7 +1684,7 @@ static void __kaapi_dump_access(
 #else
     unsigned int numaid = 0;
 #endif
-    KAAPI_EVENT_PUSH4(kproc, 0, KAAPI_EVT_TASK_ACCESS, task, mode, addr, numaid );
+    KAAPI_EVENT_PUSH4(kproc, 0, KAAPI_EVT_TASK_ACCESS, task, mode, addr, numaid);
 
     /* how to count remote access if numa information not available ? */
     if (numaid == (unsigned int)-1) return;
@@ -1728,6 +1741,87 @@ void kaapi_tracelib_task_access(
   __kaapi_dump_access(kproc, local_numaid, task, count_noalias, deps_noalias, decoder);
 }
 
+
+/* Push one KAAPI_EVT_TASK_DATA event per (data[i], size[i], mode[i]) entry; local_numaid is not used here. */
+static void __kaapi_dump_data(
+    kaapi_tracelib_thread_t*     kproc,
+    int                          local_numaid,
+    kaapi_task_id_t              task,
+    int                          count,
+    void**                       data,
+    size_t*                      size,
+    int*                         mode,
+    void                       (*decoder)(int, int*)
+)
+{
+  for (int i=0; i<count; ++i)
+  {
+    int m = KAAPI_ACCESS_MODE_VOID;
+    void* addr = data[i];
+    size_t sz = size[i];
+    decoder( mode[i], &m);  /* translate the caller's mode value into a KAAPI access mode */
+    if (m & KAAPI_ACCESS_MODE_V)  /* entries whose decoded mode has the V bit set are not recorded */
+      continue;
+
+#if defined(__linux__) && KAAPI_USE_NUMA
+    unsigned int numaid = kaapi_numa_getpage_id( addr );
+#else
+    unsigned int numaid = 0;
+#endif
+    kaapi_event_t* evt = KAAPI_EVENT_GET(kproc, 0, KAAPI_EVT_TASK_DATA );
+    if (evt)
+    {
+      evt->u.s.d0.p      = task;
+      evt->u.s.d1.p      = addr;
+      evt->u.s.d2.i      = sz;
+      evt->u.s.d3.i32[0] = m;
+      evt->u.s.d3.i32[1] = numaid;
+      KAAPI_EVENT_PUSH(kproc, 0, KAAPI_EVT_TASK_DATA);
+    }
+  }
+}
+
+
+/* Record the data items of 'task': returns immediately when KAAPI_EVT_TASK_DATA is not enabled in
+   kproc->event_mask, otherwise forwards all arguments to __kaapi_dump_data. */
+void kaapi_tracelib_task_data(
+    kaapi_tracelib_thread_t*     kproc,
+    kaapi_task_id_t              task,
+    int                          count,
+    void**                       data,
+    size_t*                      size,
+    int*                         mode,
+    void                        (*decoder)(int,int*)
+)
+{
+  if (!(kproc->event_mask & KAAPI_EVT_MASK(KAAPI_EVT_TASK_DATA)))
+    return;
+
+
+#if defined(__linux__)
+  int localcpu = sched_getcpu();  /* only consumed below when KAAPI_USE_NUMA is set */
+#if KAAPI_USE_NUMA
+  int local_numaid = numa_node_of_cpu(localcpu);
+#else
+  int local_numaid = 0;
+#endif
+#else
+  int local_numaid = 0;
+#endif
+#if 0
+printf("In %s: count:%i, data[0]:%p, size[0], %li, mode[0]: %i\n", 
+  __func__,
+  count,
+  data[0],
+  size[0],
+  mode[0]
+);
+#endif
+  __kaapi_dump_data(kproc, local_numaid, task, count, data, size, mode, decoder);
+}
+
+
+
 /*
 */
 void kaapi_tracelib_taskwait_begin(
@@ -1827,6 +1921,7 @@ kaapi_descrformat_t* kaapi_tracelib_reserve_perfcounter(void)
   retval->perfctr   = perf;
   kaapi_tracelib_param.fmt_list[kaapi_tracelib_param.fmt_listsize] = retval;
   ++kaapi_tracelib_param.fmt_listsize;
+//printf("%s\n", __func__ );
 
   kaapi_assert(retval != 0);
   return retval;
@@ -2011,6 +2106,7 @@ static int kaapi_get_events(
       return -1;
 
 
+#if KAAPI_USE_PAPI
     /* Register PAPI counter to be at KAAPI_PERF_ID_PAPI_BASE+cnt in kaapi_perfctr_info
     */
     if (type == KAAPI_PCTR_PAPI)
@@ -2047,8 +2143,9 @@ static int kaapi_get_events(
           }
           break;
       }
-    }
-    else if (type == KAAPI_PCTR_LIBRARY)
+    } else
+#endif
+    if (type == KAAPI_PCTR_LIBRARY)
     {
       if (event_code <KAAPI_PERF_ID_MAX)
       {
diff --git a/tracelib/kaapi_trace_rt.c b/tracelib/kaapi_trace_rt.c
index 790532cd1f26f31bdfc481d3c8913244e48b698f..dc79ecfc45478a92dc868144ec5f7e67f1393966 100644
--- a/tracelib/kaapi_trace_rt.c
+++ b/tracelib/kaapi_trace_rt.c
@@ -59,8 +59,8 @@ const char* kaapi_event_name[]
 /* 3 */  "TaskEnd",
 /* 4 */  "Dependency",
 /* 5 */  "Access",
-/* 6 */  "DagCompBegin",
-/* 7 */  "DagCompEnd",
+/* 6 */  "Data",
+/* 7 */  0,
 /* 8 */  0,
 /* 9 */  0,
 /*10 */  "IdleBeg",
diff --git a/ukilli/CMakeLists.txt b/ukilli/CMakeLists.txt
index 283965bffd8faba5b5d3c4606008b7853b6664ea..aa06d436af22563aec2312b43ff36f6a71cce90f 100644
--- a/ukilli/CMakeLists.txt
+++ b/ukilli/CMakeLists.txt
@@ -8,13 +8,8 @@ set(SOURCES
   ${CMAKE_SOURCE_DIR}/poti/src/poti_header.c
   )
 
-add_custom_command (
-  OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h
-  COMMAND ${SHELL_EXECUTABLE} ${CMAKE_SOURCE_DIR}/config/hw_count.sh ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h
-  )
-
 include_directories( ${CMAKE_CURRENT_BINARY_DIR} )
 
-add_executable(ukilli ${SOURCES} ${CMAKE_CURRENT_BINARY_DIR}/hw_count.h)
+add_executable(ukilli ${SOURCES})
 target_link_libraries (ukilli tracelib)
 install(TARGETS ukilli DESTINATION bin)
diff --git a/ukilli/kaapi_trace_simulator.cpp b/ukilli/kaapi_trace_simulator.cpp
index b7c9563eb681e556ed0118a2af35eff321e74f26..8773a5848c608d3ba845ebcef9b11d64895cddd5 100644
--- a/ukilli/kaapi_trace_simulator.cpp
+++ b/ukilli/kaapi_trace_simulator.cpp
@@ -632,6 +632,9 @@ static void processor_simulate_event(
     case KAAPI_EVT_TASK_ACCESS:
     break;
 
+    case KAAPI_EVT_TASK_DATA:
+    break;
+
     case KAAPI_EVT_TASK_SUCC:
     break;
 
@@ -724,9 +727,6 @@ static void processor_simulate_event(
       ++proc->active;
     break;
 
-    case KAAPI_EVT_COMP_DAG:
-    break;
-
     /* processing request */
     case KAAPI_EVT_REQUEST_BEG:
     break;
diff --git a/ukilli/ukilli.cpp b/ukilli/ukilli.cpp
index dfbb08a89beb1a3597e0965a33e422116d38ce29..b57d3cc467fbf62e26e664dae160b2e92a4576a0 100644
--- a/ukilli/ukilli.cpp
+++ b/ukilli/ukilli.cpp
@@ -322,10 +322,13 @@ static void callback_print_event(
                 << ", numa: " << KAAPI_EVENT_DATA(event,3,i);
     break;
 
-    /* unroll graph for static schedule */
-    case KAAPI_EVT_COMP_DAG:
-      std::cout << (KAAPI_EVENT_DATA(event,0,i) == 1 ? "BEGIN":"END")
-                << ", key:" << KAAPI_EVENT_DATA(event,1,u);
+    case KAAPI_EVT_TASK_DATA:
+      std::cout << " ----------------- @:" << KAAPI_EVENT_DATA(event,0,p)
+                << " data@:" << KAAPI_EVENT_DATA(event,1,p)
+                << " size:" << KAAPI_EVENT_DATA(event,2,i)
+                << " m:" << (KAAPI_EVENT_DATA(event,3,i32)[0])
+                << "(" << kaapi_getmodename((kaapi_access_mode_t)(KAAPI_EVENT_DATA(event,3,i32)[0])) << ")"
+                << ", numa: " << KAAPI_EVENT_DATA(event,3,i32[1]);
     break;
 
     /* idle = steal state */
@@ -1029,14 +1032,6 @@ static void callback_display_paje_event(
       /* can we add time step value in the gantt ? */
     break;
 
-    /* unroll graph for static schedule */
-    case KAAPI_EVT_COMP_DAG:
-      if (KAAPI_EVENT_DATA(event,0,i) == 1)
-        kaapi_trace_poti_PushState(d0, name, "STATE", "db");
-      else
-        kaapi_trace_poti_PopState (d0, name, "STATE");
-    break;
-
     case KAAPI_EVT_PARALLEL:
       if (KAAPI_EVENT_DATA(event,0,i) == 1)
         kaapi_trace_poti_PushState(d0, name, "STATE", "pi");
@@ -1295,11 +1290,15 @@ static int fnc_paje_gantt_close(kaapi_eventfile_header_t* header, uint64_t gantt
 */
 struct access_t {
   access_t(kaapi_access_mode_t m, uint64_t i, uint64_t p, int nu)
-   : mode(m), idx(i), ptr(p), numaid(nu)
+   : mode(m), idx(i), ptr(p), size((uint64_t)-1), numaid(nu)
+  {}
+  access_t(kaapi_access_mode_t m, uint64_t i, uint64_t p, uint64_t s, int nu)
+   : mode(m), idx(i), ptr(p), size(s), numaid(nu)
   {}
   kaapi_access_mode_t mode;
   uint64_t            idx;
   uint64_t            ptr;
+  uint64_t            size;
   int                 numaid;
 };
 
@@ -1330,7 +1329,7 @@ struct state_t : public event_t {
 /* */
 struct task_info : public state_t {
   task_info( )
-    : state_t(), ct(0), kid(0), fmtid(0), param(), perfctr(), pred(), succ()
+    : state_t(), ct(0), kid(0), fmtid(0), param(), data(), perfctr(), pred(), succ()
   {
     static uint64_t cnt_task = 0;
     keys[0] = keys[1] = 0;
@@ -1348,6 +1347,7 @@ struct task_info : public state_t {
   uint64_t               aff_tag;
   uint64_t               keys[2];
   std::vector<access_t>  param;
+  std::vector<access_t>  data;
   std::vector<perfctr_t> perfctr;
   std::vector<task_info*>pred;
   std::vector<task_info*>succ;
@@ -1776,7 +1776,7 @@ static void callback_display_rastello(
         break;
       }
 
-//printf("%" PRIu64 " Task: %p fmtid: %i\n", event->date, KAAPI_EVENT_DATA(event,0,p), (int)KAAPI_EVENT_DATA(event,1,u));
+//printf("%" PRIu64 " Implicit Task: %p fmtid: %i\n", event->date, KAAPI_EVENT_DATA(event,0,p), (int)KAAPI_EVENT_DATA(event,1,u));
       std::map<uint64_t,kproc_t>::iterator kp = parallel_region_t::container_kproc.find( event->kid );
       if (kp == parallel_region_t::container_kproc.end())
       {
@@ -1846,6 +1846,39 @@ printf("%i:: task[id:%i/%p] = {mode:%c}/%p\n", kid, task->id, (void*)task->addr,
     } break;
 
 
+    /* Data item attached to a task (d0: task, d1: address, d2: size,
+       d3.i32[0]: access mode, d3.i32[1]: NUMA node of the data). */
+    case KAAPI_EVT_TASK_DATA:
+    {
+      uint64_t ptask  = KAAPI_EVENT_DATA(event,0,u);
+      uint64_t paddr  = KAAPI_EVENT_DATA(event,1,u);
+      uint64_t size   = KAAPI_EVENT_DATA(event,2,u);
+      int      mode   = KAAPI_EVENT_DATA(event,3,i32)[0];
+      int      numaid = (int)KAAPI_EVENT_DATA(event,3,i32)[1];
+      /* data of implicit tasks are not recorded */
+      if (rpr->container_implicit_task.find( ptask) != rpr->container_implicit_task.end())
+        break;
+      task_info* task = rpr->get_taskinfo( ptask, false );
+      if (task ==0)
+      {
+        printf("*** Event task_data does not correspond to event task_begin: task:%p\n", (void*)ptask );
+        break;
+      }
+      /* reuse the index already given to this address, else allocate a new one */
+      uint64_t idx;
+      std::map<uint64_t,data_t>::iterator data = rpr->container_data.find( paddr );
+      if (data != rpr->container_data.end())
+        idx = data->second.cnt_iddata; /* no (int) cast: keep the full 64-bit index */
+      else
+      {
+        idx = rpr->cnt_iddata++;
+        rpr->container_data.insert( std::make_pair( paddr, data_t(idx, numaid) ) );
+      }
+      task->data.push_back( access_t((kaapi_access_mode_t)mode, idx, paddr, size, numaid) );
+    }
+    break;
+
+
     case KAAPI_EVT_TASK_PERFCOUNTER:
     {
       uint64_t addr = KAAPI_EVENT_DATA(event,0,u);
@@ -1877,16 +1910,13 @@ printf("%i:: task[id:%i/%p] = {mode:%c}/%p\n", kid, task->id, (void*)task->addr,
 
     case KAAPI_EVT_TASK_SUCC:
     {
-      task_info* task = rpr->get_taskinfo( KAAPI_EVENT_DATA(event,0,u), false );
-      if (task != 0)/* should be top stack task */
-      {
-        task_info* succ = rpr->get_taskinfo( KAAPI_EVENT_DATA(event,1,u), true );
+      task_info* task = rpr->get_taskinfo( KAAPI_EVENT_DATA(event,0,u), true );
+      task_info* succ = rpr->get_taskinfo( KAAPI_EVENT_DATA(event,1,u), true );
 #if DEBUG_RAST
 printf("pred[id:%i/%p] = {id:%i}/%p\n", succ->id, (void*)succ->addr, task->id, (void*)task->addr);
 #endif
-        succ->pred.push_back( task );
-        task->succ.push_back( succ );
-      }
+      succ->pred.push_back( task );
+      task->succ.push_back( succ );
     } break;
 
 
@@ -1917,7 +1947,11 @@ printf("pred[id:%i/%p] = {id:%i}/%p\n", succ->id, (void*)succ->addr, task->id, (
       {
         task->stop = event->date;
         task->numaid = (uint32_t)KAAPI_EVENT_DATA(event,1,i);
+#if DEBUG_RAST
+        printf("%i:: task end:id:%i/%p\n", kid, task->id, (void*)task->addr);
+#endif
       }
+      /* should be top ? */
       if (kp == parallel_region_t::container_kproc.end())
         printf("***[%d] Unkown thread kid: %i\n", __LINE__, event->kid);
 /*
@@ -1997,10 +2031,6 @@ printf("pred[id:%i/%p] = {id:%i}/%p\n", succ->id, (void*)succ->addr, task->id, (
     case KAAPI_EVT_PERFCOUNTER:
       break;
 
-    /* unroll graph for static schedule */
-    case KAAPI_EVT_COMP_DAG:
-    break;
-
     case KAAPI_EVT_TASKWAIT :
     case KAAPI_EVT_TASKGROUP:
     break;
@@ -2105,7 +2135,7 @@ printf("pred[id:%i/%p] = {id:%i}/%p\n", succ->id, (void*)succ->addr, task->id, (
     } break;
 
     default:
-      printf("***Unknown event number: %i\n", event->evtno);
+      printf("***(Rast)Unknown event number: %i\n", event->evtno);
       break;
   }
 }
@@ -2277,6 +2307,8 @@ int dot_parallel_region_t::openfile(kaapi_eventfile_header_t* header)
   return 0;
 }
 
+
+/* */
 int dot_parallel_region_t::closefile(int cpucount)
 {
   if (fout == 0) return 1;
@@ -2840,22 +2872,6 @@ void csv_parallel_region_t::dump_task_info(  task_info* ti )
             ti->perfctr[i].value );
     }
   }
-#if 0 /* not yet in for csv */
-  for (size_t i=0; i< ti->param.size(); ++i)
-    switch (ti->param[i].mode)
-    {
-      case KAAPI_ACCESS_MODE_R:
-        fprintf(fout,"\taction_add_read(T, &blocks[id(\"A\",{%" PRIu64 "})]);\n", ti->param[i].idx);
-        break;
-      case KAAPI_ACCESS_MODE_W:
-        fprintf(fout,"\taction_add_write(T, &blocks[id(\"A\",{%" PRIu64 "})]);\n", ti->param[i].idx);
-        break;
-      case KAAPI_ACCESS_MODE_RW:
-        fprintf(fout,"\taction_add_readwrite(T, &blocks[id(\"A\",{%" PRIu64 "})]);\n", ti->param[i].idx);
-        break;
-      default: break;
-    }
-#endif
   fprintf(fout, "\n" );
 }
 
@@ -2943,6 +2959,235 @@ static void fnc_csv( int count, const char** filenames )
 }
 
 
+/* ============================= SOMP OUTPUT */
+/* SOMP: trace format
+*/
+struct somp_parallel_region_t : public parallel_region_t { /* parallel region written as a SOMP text trace */
+  somp_parallel_region_t(uintptr_t pid) /* pid is forwarded to the parallel_region_t constructor */
+    : parallel_region_t(pid)
+  {}
+
+  int openfile(kaapi_eventfile_header_t* header); /* open trace_<parallel_id>.rec and register task format names; returns 0 */
+  int closefile(int cpucount);                    /* dump all tasks then close the file; returns 1 if no file is open */
+  void dump_task_info( task_info* ti );           /* write the record of one task to the output file */
+private:
+};
+
+class somp_parallel_region_factory_t { /* factory producing somp_parallel_region_t objects */
+public:
+  virtual parallel_region_t* create(uintptr_t); /* returns a newly allocated region */
+};
+somp_parallel_region_factory_t somp_pr_factory; /* global instance, passed to ReadFiles by fnc_somp */
+
+parallel_region_t* somp_parallel_region_factory_t::create(uintptr_t prid)
+{
+  return new somp_parallel_region_t(prid); /* allocated with new; not released in this function */
+}
+
+
+/* Create the output file "trace_<parallel_id>.rec" of this region and register
+   the name of every task format declared in the trace header.
+   Exits the process if the file cannot be created; returns 0 otherwise. */
+int somp_parallel_region_t::openfile(kaapi_eventfile_header_t* header)
+{
+  sprintf(filename, "trace_%i.rec", (int)parallel_id);
+  fout = fopen(filename,"w");
+  if (fout ==0)
+  {
+    fprintf(stderr,"*** Cannot open file '%s'\n",filename);
+    exit(-1);
+  }
+  /* entries whose fmtid is 0 are skipped */
+  for (int cnt=0; cnt<header->taskfmt_count; ++cnt)
+  {
+    if (header->fmtdefs[cnt].fmtid ==0) continue;
+    rastello_fmtname.insert( std::make_pair(header->fmtdefs[cnt].fmtid, header->fmtdefs[cnt].name) );
+  }
+  return 0;
+}
+
+/* Write every task of the region, in container_orderedlist order, to the
+   currently open output file, then close it.
+   cpucount is not used. Returns 1 when no file is open, 0 otherwise. */
+int somp_parallel_region_t::closefile(int cpucount)
+{
+  if (fout == 0)
+    return 1;
+
+  /* start from an empty container_data before dumping the tasks */
+  container_data.clear();
+
+  typedef std::list<task_info*>::iterator task_iterator;
+  for (task_iterator it = container_orderedlist.begin(); it != container_orderedlist.end(); ++it)
+    dump_task_info(*it);
+
+  fprintf(stdout,"*** File '%s' generated\n", filename);
+  fclose(fout);
+  fout = 0;
+
+  container_data.clear();
+  return 0;
+}
+
+
+/* Write the SOMP record of one task to fout. Fields, one per line:
+  Name: task format name, "<unknown>" if fmtid is not registered
+  JobId: task identifier
+  StartTime / EndTime: task dates scaled by 1e-6 (ns -> ms)
+  MemoryNode: NUMA node stored in the task
+  Handles: addresses of the task parameters (ti->param)
+  Modes: access mode of each parameter (R|W|RW|MX|S)
+  Sizes: always empty, parameter sizes are not emitted
+  MData: <list of addresses>          } only if the task has
+  MSize: <list of size>               } KAAPI_EVT_TASK_DATA items
+  MMode: <list of modes (R|W|RW|S)>   } (ti->data)
+  DependsOn: ids of the predecessor tasks
+  The record ends with an empty line.
+  Values are cast to the exact type of their printf conversion: passing an
+  uint64_t to %p or %li is undefined behaviour.
+*/
+void somp_parallel_region_t::dump_task_info( task_info* ti )
+{
+  /* task: resolve the format name in rastello_fmtname */
+  std::map<uint64_t, char*>::iterator fmt =rastello_fmtname.find(ti->fmtid);
+
+  if (fmt == rastello_fmtname.end())
+    fprintf(stdout, "cannot find fmtid: %" PRIu64 "\n", ti->fmtid);
+
+  fprintf(fout,"Name: %s\n"
+               "JobId: %lli\n"
+               "StartTime: %f\n"
+               "EndTime: %f\n"
+               "MemoryNode: %lli\n",
+    (fmt == rastello_fmtname.end() ? "<unknown>" : fmt->second ),
+    (long long)ti->id,
+    (double)ti->start*1e-6 /* ns -> millis */,
+    (double)ti->stop*1e-6 /* ns -> millis */,
+    (long long)ti->numaid
+  );
+
+  /* handles */
+  fprintf(fout,"Handles:");
+  for (size_t i=0; i<ti->param.size(); ++i)
+  {
+    void* ptr = (void*)(uintptr_t)ti->param[i].ptr; /* %p requires a pointer */
+    fprintf(fout," %p", ptr);
+  }
+  fprintf(fout,"\n");
+
+  /* Modes: a mode matching none of the tests below prints nothing */
+  fprintf(fout,"Modes:");
+  for (size_t i=0; i<ti->param.size(); ++i)
+  {
+    kaapi_access_mode_t mode   = ti->param[i].mode;
+    if (KAAPI_ACCESS_IS_READWRITE(mode))
+      fprintf(fout," RW");
+    else if (KAAPI_ACCESS_IS_READ(mode))
+      fprintf(fout," R");
+    else if (KAAPI_ACCESS_IS_WRITE(mode))
+      fprintf(fout," W");
+    else if (mode == (KAAPI_ACCESS_MODE_C|KAAPI_ACCESS_MODE_CW))
+      fprintf(fout," MX");
+    else if (mode & KAAPI_ACCESS_MODE_SCRATCH)
+      fprintf(fout," S");
+  }
+  fprintf(fout,"\n");
+
+  /* Sizes: the field is emitted with no value */
+  fprintf(fout,"Sizes:");
+  fprintf(fout,"\n");
+
+  /* MData */
+  if (ti->data.size() >0)
+  {
+    fprintf(fout,"MData:");
+    for (size_t i=0; i<ti->data.size(); ++i)
+    {
+      void* ptr = (void*)(uintptr_t)ti->data[i].ptr; /* %p requires a pointer */
+      fprintf(fout," %p", ptr);
+    }
+    fprintf(fout,"\n");
+
+    /* MSize */
+    fprintf(fout,"MSize:");
+    for (size_t i=0; i<ti->data.size(); ++i)
+    {
+      uint64_t            size   = ti->data[i].size;
+      fprintf(fout," %" PRIu64, size);
+    }
+    fprintf(fout,"\n");
+
+    /* MMode */
+    fprintf(fout,"MMode:");
+    for (size_t i=0; i<ti->data.size(); ++i)
+    {
+      kaapi_access_mode_t mode   = ti->data[i].mode;
+      if (KAAPI_ACCESS_IS_READWRITE(mode))
+        fprintf(fout," RW");
+      else if (KAAPI_ACCESS_IS_READ(mode))
+        fprintf(fout," R");
+      else if (KAAPI_ACCESS_IS_WRITE(mode))
+        fprintf(fout," W");
+      else if (mode & KAAPI_ACCESS_MODE_SCRATCH)
+        fprintf(fout," S");
+      else /* reported on stderr only: nothing is written to the file for this item */
+        fprintf(stderr," Invalid mode ");
+    }
+    fprintf(fout,"\n");
+  }
+
+  /* DependsOn */
+  fprintf(fout,"DependsOn:");
+  for (size_t i=0; i<ti->pred.size(); ++i)
+  {
+    fprintf(fout," %lli", (long long)ti->pred[i]->id);
+  }
+  fprintf(fout,"\n\n");
+}
+
+
+
+/* Entry point of the SOMP output (--somp / -s).
+   Reuses the rastello data structures: ReadFiles is given somp_pr_factory and
+   callback_display_rastello, then every non null entry of
+   rastello_parallel_region is written to its own file by openfile() followed
+   by closefile().
+   count     : number of entries in filenames
+   filenames : names of the event files to read
+   Returns silently if OpenFiles returns 0 or GetHeader fails; exits with
+   status 1 if openfile() or closefile() reports an error.
+*/
+static void fnc_somp( int count, const char** filenames )
+{
+  rastello_parallel_region.reserve(128);
+  rastello_front_parallel_region.reserve(256);
+
+  rastello_fs = OpenFiles( count, filenames );
+  if (rastello_fs ==0)
+    return;
+  if (GetHeader(rastello_fs, &rastello_header) !=0)
+    return;
+
+  /* process all the events of the files */
+  ReadFiles(rastello_fs, &somp_pr_factory, callback_display_rastello );
+
+  /* one output file per region */
+  for (size_t i = 0; i<rastello_parallel_region.size(); ++i)
+  {
+    somp_parallel_region_t* region = (somp_parallel_region_t*)rastello_parallel_region[i];
+    if (region ==0)
+      continue;
+    if (region->openfile( &rastello_header) !=0)
+      exit(1);
+    if (region->closefile(rastello_parallel_region[i]->nproc) !=0)
+      exit(1);
+  }
+
+  /* close & umap */
+  CloseFiles(rastello_fs);
+}
+
+
 
 /*
 */
@@ -2967,8 +3212,9 @@ static void print_usage(const char* msg = 0)
   fprintf(stderr, "    --dot-nolabel      : do not output label.\n");
   fprintf(stderr, "    --dot-cregion      : output graph accross parallel regions.\n");
 //  fprintf(stderr, "     --dot-nodata : do not output data node.\n");
-  fprintf(stderr, "  -r | --rastello      : output Rastello format compatible with CORSE team simulator.\n");
-  fprintf(stderr, "                         Output filename is rastello_<n>.c, one per parallel region.\n");
+  fprintf(stderr, "  -s | --somp          : output file with SOMP trace format .\n");
+//  fprintf(stderr, "  -r | --rastello      : output Rastello format compatible with CORSE team simulator.\n");
+//  fprintf(stderr, "                         Output filename is rastello_<n>.c, one per parallel region.\n");
 //  fprintf(stderr, "  --steal-event   : include steal events in trace.\n");
 //  fprintf(stderr, "  --gpu-trace     : include GPU trace information.\n");
 //  fprintf(stderr, "  --gpu-transfer  : include GPU transfers.\n");
@@ -3005,6 +3251,8 @@ static kaapi_fnc_event parse_option( const int argc, const char** argv, int* cou
       option = 'a';
     else if ((strcmp(argv[i], "--display-header") ==0)||(strcmp(argv[i], "-e") ==0))
       option = 'h';
+    else if ((strcmp(argv[i], "--somp") ==0) || (strcmp(argv[i], "-s") ==0))
+      option = 'o';
     else if ((strcmp(argv[i], "--rastello") ==0) || (strcmp(argv[i], "-r") ==0))
       option = 'r';
     else if ((strcmp(argv[i], "--csv") ==0) || (strcmp(argv[i], "-c") ==0))
@@ -3123,6 +3371,9 @@ static kaapi_fnc_event parse_option( const int argc, const char** argv, int* cou
   case 'd':
     return fnc_dot;
 
+  case 'o':
+    return fnc_somp;
+
   case 'H':
   default:
     print_usage();