From 250c261e9e502cf179a1c468ba10e46d53404e9d Mon Sep 17 00:00:00 2001
From: PRUVOST Florent <florent.pruvost@inria.fr>
Date: Fri, 13 Dec 2019 15:10:42 +0100
Subject: [PATCH] Remove the FXT starpu option and the enable tracing. It is
 now done by eztrace, or directly with STARPU_USE_FXT

---
 CMakeLists.txt                                |  37 ---
 compute/CMakeLists.txt                        |   5 -
 control/context.c                             |   2 +
 coreblas/eztrace_module/CMakeLists.txt        |  61 ++---
 ...ce_module => coreblas_core_eztrace_module} |   4 +-
 .../coreblas_tcore_eztrace_module             | 224 ++++++++++++++++++
 doc/orgmode/chapters/installing.org           |  30 ++-
 doc/orgmode/chapters/using.org                |  79 +++++-
 example/link_chameleon/CMakeLists.txt         |   2 +-
 include/chameleon/config.h.in                 |   3 -
 new-testing/testing_zauxiliary.c              |  46 +++-
 new-testing/testing_zauxiliary.h              |  58 -----
 new-testing/testing_zgetrf.c                  |   2 -
 runtime/openmp/control/runtime_profiling.c    |   2 -
 runtime/parsec/control/runtime_profiling.c    |   2 -
 runtime/quark/control/runtime_profiling.c     |   2 -
 runtime/starpu/control/runtime_control.c      |   4 +-
 runtime/starpu/control/runtime_profiling.c    |  10 +-
 runtime/starpu/include/chameleon_starpu.h.in  |   1 -
 19 files changed, 386 insertions(+), 188 deletions(-)
 rename coreblas/eztrace_module/{coreblas_eztrace_module => coreblas_core_eztrace_module} (99%)
 create mode 100644 coreblas/eztrace_module/coreblas_tcore_eztrace_module

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 33bce25ca..27a355f8b 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -184,13 +184,6 @@ if (CHAMELEON_ENABLE_CUDA AND NOT CHAMELEON_USE_CUDA)
     message("-- ${BoldGreen}CHAMELEON_USE_CUDA is set to OFF, turn it ON to use CUDA (unsupported by Quark)${ColourReset}")
 endif()
 
-# Enable FXT if StarPU
-option(CHAMELEON_ENABLE_TRACING "Enable tracing support" OFF)
-if (NOT CHAMELEON_ENABLE_TRACING)
-    message("-- ${BoldGreen}CHAMELEON_ENABLE_TRACING is set to OFF, turn it ON to use FxT (with StarPU)${ColourReset}")
-endif()
-#option(CHAMELEON_USE_EZTRACE "Enable EZTRACE to build modules" OFF)
-
 option(CHAMELEON_RUNTIME_SYNC "Enable synchronous task submission when available to debug the code without parallelism" OFF)
 if (CHAMELEON_RUNTIME_SYNC)
     message("-- ${BoldGreen}CHAMELEON_RUNTIME_SYNC is set to ON, turn it OFF to avoid synchronisation in the tasks submission${ColourReset}")
@@ -605,9 +598,6 @@ if( CHAMELEON_SCHED_STARPU )
     if(CHAMELEON_USE_MPI)
         list(APPEND STARPU_COMPONENT_LIST "MPI")
     endif()
-    if(CHAMELEON_ENABLE_TRACING)
-        list(APPEND STARPU_COMPONENT_LIST "FXT")
-    endif()
 
     find_package(STARPU ${CHAMELEON_STARPU_VERSION} REQUIRED
                  COMPONENTS ${STARPU_COMPONENT_LIST})
@@ -673,18 +663,6 @@ if( CHAMELEON_SCHED_STARPU )
             set(CHAMELEON_USE_MIGRATE "OFF")
             message("-- ${Blue}CHAMELEON_USE_MIGRATE is turned OFF because starpu_mpi_data_migrate not found${ColourReset}")
         endif()
-        if(CHAMELEON_ENABLE_TRACING)
-            # check if fxt profiling is accessible in starpu and activate it in chameleon
-            check_function_exists(starpu_fxt_start_profiling HAVE_STARPU_FXT_PROFILING)
-            if ( HAVE_STARPU_FXT_PROFILING )
-                message("-- ${Blue}Add definition HAVE_STARPU_FXT_PROFILING"
-                " - Activate FxT profiling through StarPU${ColourReset}")
-            else()
-                message("-- ${Red}Looking for starpu with fxt"
-                " - starpu_fxt_start_profiling() test fails in StarPU${ColourReset}")
-                message("-- ${Red}Check in CMakeFiles/CMakeError.log to figure out why it fails${ColourReset}")
-            endif()
-        endif()
         if (CHAMELEON_USE_MPI)
             # Check if a specific function exist
             check_function_exists(starpu_mpi_data_register_comm HAVE_STARPU_MPI_DATA_REGISTER)
@@ -716,14 +694,6 @@ if( CHAMELEON_SCHED_STARPU )
                     Print_Find_Library_Status(hwloc libhwloc)
                 endif ()
             endif()
-            if(CHAMELEON_ENABLE_TRACING AND (NOT FXT_FOUND OR NOT FXT_LIBRARIES))
-                if (NOT FXT_fxt.h_DIRS)
-                    Print_Find_Header_Status(fxt fxt.h)
-                endif ()
-                if (NOT FXT_fxt_LIBRARY)
-                    Print_Find_Library_Status(fxt libfxt)
-                endif ()
-            endif()
             if(CHAMELEON_SIMULATION AND (NOT SIMGRID_FOUND OR NOT SIMGRID_LIBRARIES))
                 if (NOT SIMGRID_simgrid.h_DIRS)
                     Print_Find_Header_Status(simgrid simgrid.h)
@@ -758,9 +728,6 @@ if( CHAMELEON_SCHED_STARPU )
         if(CHAMELEON_SIMULATION AND (NOT SIMGRID_FOUND OR NOT SIMGRID_LIBRARIES))
             message(FATAL_ERROR "SimGrid library is required but has not been found")
         endif()
-        if(CHAMELEON_ENABLE_TRACING AND (NOT FXT_FOUND OR NOT FXT_LIBRARIES))
-            message(FATAL_ERROR "FxT library is required but has not been found")
-        endif()
         if( (NOT STARPU_SHM_FOUND) OR (NOT STARPU_SHM_LIBRARIES) OR
             ( STARPU_LOOK_FOR_MPI AND (NOT STARPU_MPI_FOUND OR NOT STARPU_MPI_LIBRARIES) )
           )
@@ -780,10 +747,6 @@ if( CHAMELEON_SCHED_PARSEC )
     if(CHAMELEON_USE_CUDA)
         list(APPEND PARSEC_COMPONENT_LIST "CUDA")
     endif()
-    # TODO: Add a CHAMELEON_WITH_PROFILING option that enables Fxt for StarPU, or PAPI for PaRSEC
-    #if(CHAMELEON_WITH_PROFILING)
-    #    list(APPEND PARSEC_COMPONENT_LIST "PAPI")
-    #endif()
     find_package(PARSEC COMPONENTS ${PARSEC_COMPONENT_LIST})
     if(PARSEC_FOUND)
         message("-- ${Blue}Add definition CHAMELEON_SCHED_PARSEC"
diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt
index 3ca2aaaa1..b49870887 100644
--- a/compute/CMakeLists.txt
+++ b/compute/CMakeLists.txt
@@ -304,11 +304,6 @@ elseif(CHAMELEON_SCHED_QUARK)
 elseif(CHAMELEON_SCHED_OPENMP)
   target_link_libraries(chameleon chameleon_openmp)
 endif()
-if (NOT CHAMELEON_SIMULATION)
-  # Depends on coreblas only for set_coreblas_gemm3m_enabled() (Maybe we should change that)
-  add_dependencies(chameleon coreblas_include)
-  target_link_libraries(chameleon coreblas)
-endif()
 target_link_libraries(chameleon hqr)
 
 add_dependencies(chameleon
diff --git a/control/context.c b/control/context.c
index 4e0a6672b..759716782 100644
--- a/control/context.c
+++ b/control/context.c
@@ -148,6 +148,7 @@ int CHAMELEON_Enable(int option)
             break;
         case CHAMELEON_PROFILING_MODE:
             chamctxt->profiling_enabled = CHAMELEON_TRUE;
+            RUNTIME_start_profiling();
             break;
         case CHAMELEON_PROGRESS:
             chamctxt->progress_enabled = CHAMELEON_TRUE;
@@ -218,6 +219,7 @@ int CHAMELEON_Disable(int option)
             break;
         case CHAMELEON_PROFILING_MODE:
             chamctxt->profiling_enabled = CHAMELEON_FALSE;
+            RUNTIME_stop_profiling();
             break;
         case CHAMELEON_PROGRESS:
             chamctxt->progress_enabled = CHAMELEON_FALSE;
diff --git a/coreblas/eztrace_module/CMakeLists.txt b/coreblas/eztrace_module/CMakeLists.txt
index 810bab667..2850ca003 100644
--- a/coreblas/eztrace_module/CMakeLists.txt
+++ b/coreblas/eztrace_module/CMakeLists.txt
@@ -39,36 +39,39 @@ if (EZTRACE_FOUND AND EZTRACE_DIR_FOUND)
 
         set(EZTRACE_CREATE_PLUGIN "${EZTRACE_eztrace_create_plugin_DIR}/eztrace_create_plugin")
 
-        add_custom_command(
-            OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/output
-            COMMAND ${EZTRACE_CREATE_PLUGIN}
-            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_eztrace_module
-            DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_eztrace_module
-            )
-        add_custom_target(
-            eztrace-module-chameleon_core-dir ALL
-            DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output
-            )
-        add_custom_command(
-            OUTPUT libeztrace-convert-chameleon_core.so
-            COMMAND make
-            WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/output
-            DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output
-            )
-        add_custom_target(
-            eztrace-module-chameleon_core-libs ALL
-            DEPENDS libeztrace-convert-chameleon_core.so
-            )
-        # installation
-        # ------------
-        install(
-            FILES
-            ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-autostart-chameleon_core.so
-            ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-chameleon_core.so
-            ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-convert-chameleon_core.so
-            DESTINATION ${EZTRACE_LIBRARY_DIRS}
-            )
+        set( COREBLAS_EZTRACE_MODULES core tcore )
 
+        foreach( _module ${COREBLAS_EZTRACE_MODULES} )
+            # Generate the plugin sources of this module in its own build sub-directory
+            add_custom_command(
+                OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_module}
+                COMMAND ${EZTRACE_CREATE_PLUGIN} -o ${CMAKE_CURRENT_BINARY_DIR}/${_module} ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_${_module}_eztrace_module > /dev/null
+                DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_${_module}_eztrace_module
+                )
+            add_custom_target(
+                eztrace-module-chameleon_${_module}-dir ALL
+                DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${_module}
+                )
+            add_custom_command(
+                OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-convert-chameleon_${_module}.so # full path: make writes the library in the module sub-directory
+                COMMAND make
+                WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${_module}
+                DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${_module}
+                )
+            add_custom_target(
+                eztrace-module-chameleon_${_module}-libs ALL
+                DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-convert-chameleon_${_module}.so
+                )
+            # installation
+            # ------------
+            install(
+                FILES
+                ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-autostart-chameleon_${_module}.so
+                ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-chameleon_${_module}.so
+                ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-convert-chameleon_${_module}.so
+                DESTINATION ${EZTRACE_LIBRARY_DIRS}
+                )
+        endforeach()
     endif (EZTRACE_eztrace_create_plugin_DIR)
 
 endif (EZTRACE_FOUND AND EZTRACE_DIR_FOUND)
diff --git a/coreblas/eztrace_module/coreblas_eztrace_module b/coreblas/eztrace_module/coreblas_core_eztrace_module
similarity index 99%
rename from coreblas/eztrace_module/coreblas_eztrace_module
rename to coreblas/eztrace_module/coreblas_core_eztrace_module
index c93f7a3a4..a9e517548 100644
--- a/coreblas/eztrace_module/coreblas_eztrace_module
+++ b/coreblas/eztrace_module/coreblas_core_eztrace_module
@@ -1,9 +1,7 @@
 BEGIN_MODULE
 NAME chameleon_core
 DESC "Module for Chameleon CORE functions"
-ID 7770
-
-#include <chameleon.h>
+ID 0
 
 void CORE_scasum(int storev, int uplo, int M, int N,
                  void *A, int lda, float *work);
diff --git a/coreblas/eztrace_module/coreblas_tcore_eztrace_module b/coreblas/eztrace_module/coreblas_tcore_eztrace_module
new file mode 100644
index 000000000..18698b8a1
--- /dev/null
+++ b/coreblas/eztrace_module/coreblas_tcore_eztrace_module
@@ -0,0 +1,224 @@
+BEGIN_MODULE
+NAME chameleon_tcore
+DESC "Module for Chameleon TCORE functions"
+ID 1
+
+
+void TCORE_sasum( int storev, int uplo, int M, int N, const void *A, float *work );
+int  TCORE_saxpy( int M, void *alpha, const void *A, int incA, void *B, int incB );
+int  TCORE_sgeadd( int trans, int M, int N, void *alpha, const void *A, float beta, void *B );
+int  TCORE_sgelqt( int M, int N, int IB, void *A, void *T, float *TAU, float *WORK );
+void TCORE_sgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, float beta, void *C );
+int  TCORE_sgeqrt( int M, int N, int IB, void *A, void *T, float *TAU, float *WORK );
+int  TCORE_sgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A );
+int  TCORE_sgessq( int storev, int M, int N, const void *A, void *sclssq );
+int  TCORE_sgetrf( int M, int N, void *A, int *IPIV, int *INFO );
+int  TCORE_sgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO );
+int  TCORE_sgetrf_nopiv( int M, int N, int IB, void *A, int *INFO );
+void TCORE_she2ge( int uplo, int M, int N, const void *A, void *B );
+int  TCORE_ssyrfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, float *WORK, int ldwork );
+void TCORE_slacpy( int uplo, int M, int N, const void *A, void *B );
+void TCORE_slange( int norm, int M, int N, const void *A, float *work, float *normA );
+void TCORE_slansy( int norm, int uplo, int N, const void *A, float *work, float *normA );
+void TCORE_slantr( int norm, int uplo, int diag, int M, int N, const void *A, float *work, float *normA );
+int  TCORE_slascal( int uplo, int m, int n, void *alpha, void *A );
+void TCORE_slaset( int uplo, int n1, int n2, void *alpha, float beta, void *A );
+void TCORE_slaset2( int uplo, int n1, int n2, void *alpha, void *A );
+int  TCORE_slatro( int uplo, int trans, int M, int N, const void *A, void *B );
+void TCORE_slauum( int uplo, int N, void *A );
+void TCORE_splgsy( float bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_splrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_spotrf( int uplo, int n, void *A, int *INFO );
+int  TCORE_sssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV );
+void TCORE_ssymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, float beta, void *C );
+void TCORE_ssyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, float beta, void *C );
+void TCORE_ssyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, float beta, void *C );
+int  TCORE_ssyssq( int storev, int uplo, int N, const void *A, void *sclssq );
+int  TCORE_ssytf2_nopiv( int uplo, int n, void *A );
+int  TCORE_stplqt( int M, int N, int L, int IB, void *A, void *B, void *T, float *WORK );
+int  TCORE_stpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, float *WORK );
+int  TCORE_stpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, float *WORK );
+int  TCORE_stpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, float *WORK );
+int  TCORE_stradd( int uplo, int trans, int M, int N, void *alpha, const void *A, float beta, void *B );
+void TCORE_strasm( int storev, int uplo, int diag, int M, int N, const void *A, float *work );
+void TCORE_strmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+void TCORE_strsm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+int  TCORE_strssq( int uplo, int diag, int M, int N, const void *A, void *sclssq );
+void TCORE_strtri( int uplo, int diag, int N, void *A, int *info );
+int  TCORE_stsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, float *WORK, int ldwork );
+int  TCORE_stsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, float *WORK, int ldwork );
+int  TCORE_ststrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, float *WORK, int LDWORK, int *INFO );
+int  TCORE_sormlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, float *WORK, int LDWORK );
+int  TCORE_sormqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, float *WORK, int LDWORK );
+int TCORE_sgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A );
+
+void TCORE_dasum( int storev, int uplo, int M, int N, const void *A, double *work );
+int  TCORE_daxpy( int M, void *alpha, const void *A, int incA, void *B, int incB );
+int  TCORE_dgeadd( int trans, int M, int N, void *alpha, const void *A, double beta, void *B );
+int  TCORE_dgelqt( int M, int N, int IB, void *A, void *T, double *TAU, double *WORK );
+void TCORE_dgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C );
+int  TCORE_dgeqrt( int M, int N, int IB, void *A, void *T, double *TAU, double *WORK );
+int  TCORE_dgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A );
+int  TCORE_dgessq( int storev, int M, int N, const void *A, void *sclssq );
+int  TCORE_dgetrf( int M, int N, void *A, int *IPIV, int *INFO );
+int  TCORE_dgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO );
+int  TCORE_dgetrf_nopiv( int M, int N, int IB, void *A, int *INFO );
+void TCORE_dhe2ge( int uplo, int M, int N, const void *A, void *B );
+int  TCORE_dsyrfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, double *WORK, int ldwork );
+void TCORE_dlacpy( int uplo, int M, int N, const void *A, void *B );
+void TCORE_dlange( int norm, int M, int N, const void *A, double *work, double *normA );
+void TCORE_dlansy( int norm, int uplo, int N, const void *A, double *work, double *normA );
+void TCORE_dlantr( int norm, int uplo, int diag, int M, int N, const void *A, double *work, double *normA );
+int  TCORE_dlascal( int uplo, int m, int n, void *alpha, void *A );
+void TCORE_dlaset( int uplo, int n1, int n2, void *alpha, double beta, void *A );
+void TCORE_dlaset2( int uplo, int n1, int n2, void *alpha, void *A );
+int  TCORE_dlatro( int uplo, int trans, int M, int N, const void *A, void *B );
+void TCORE_dlauum( int uplo, int N, void *A );
+void TCORE_dplgsy( double bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_dplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_dpotrf( int uplo, int n, void *A, int *INFO );
+int  TCORE_dssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV );
+void TCORE_dsymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, double beta, void *C );
+void TCORE_dsyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, double beta, void *C );
+void TCORE_dsyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C );
+int  TCORE_dsyssq( int storev, int uplo, int N, const void *A, void *sclssq );
+int  TCORE_dsytf2_nopiv( int uplo, int n, void *A );
+int  TCORE_dtplqt( int M, int N, int L, int IB, void *A, void *B, void *T, double *WORK );
+int  TCORE_dtpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, double *WORK );
+int  TCORE_dtpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, double *WORK );
+int  TCORE_dtpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, double *WORK );
+int  TCORE_dtradd( int uplo, int trans, int M, int N, void *alpha, const void *A, double beta, void *B );
+void TCORE_dtrasm( int storev, int uplo, int diag, int M, int N, const void *A, double *work );
+void TCORE_dtrmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+void TCORE_dtrsm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+int  TCORE_dtrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq );
+void TCORE_dtrtri( int uplo, int diag, int N, void *A, int *info );
+int  TCORE_dtsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, double *WORK, int ldwork );
+int  TCORE_dtsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, double *WORK, int ldwork );
+int  TCORE_dtstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, double *WORK, int LDWORK, int *INFO );
+int  TCORE_dormlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, double *WORK, int LDWORK );
+int  TCORE_dormqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, double *WORK, int LDWORK );
+int TCORE_dgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A );
+
+void TCORE_scasum( int storev, int uplo, int M, int N, const void *A, float *work );
+int  TCORE_caxpy( int M, void * alpha, const void *A, int incA, void *B, int incB );
+int  TCORE_cgeadd( int trans, int M, int N, void * alpha, const void *A, void * beta, void *B );
+int  TCORE_cgelqt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK );
+void TCORE_cgemm( int transA, int transB, int M, int N, int K, void * alpha, const void *A, const void *B, void * beta, void *C );
+int  TCORE_cgeqrt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK );
+int  TCORE_cgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A );
+int  TCORE_cgessq( int storev, int M, int N, const void *A, void *sclssq );
+int  TCORE_cgetrf( int M, int N, void *A, int *IPIV, int *INFO );
+int  TCORE_cgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO );
+int  TCORE_cgetrf_nopiv( int M, int N, int IB, void *A, int *INFO );
+void TCORE_che2ge( int uplo, int M, int N, const void *A, void *B );
+void TCORE_chemm( int side, int uplo, int M, int N, void * alpha, const void *A, const void *B, void * beta, void *C );
+void TCORE_cherk( int uplo, int trans, int N, int K, void *alpha, const void *A, float beta, void *C );
+void TCORE_cher2k( int uplo, int trans, int N, int K, void * alpha, const void *A, const void *B, float beta, void *C );
+int  TCORE_cherfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, void *WORK, int ldwork );
+int  TCORE_chessq( int storev, int uplo, int N, const void *A, void *sclssq );
+void TCORE_clacpy( int uplo, int M, int N, const void *A, void *B );
+void TCORE_clange( int norm, int M, int N, const void *A, float *work, float *normA );
+void TCORE_clanhe( int norm, int uplo, int N, const void *A, float *work, float *normA );
+void TCORE_clansy( int norm, int uplo, int N, const void *A, float *work, float *normA );
+void TCORE_clantr( int norm, int uplo, int diag, int M, int N, const void *A, float *work, float *normA );
+int  TCORE_clascal( int uplo, int m, int n, void * alpha, void *A );
+void TCORE_claset( int uplo, int n1, int n2, void * alpha, void * beta, void *A );
+void TCORE_claset2( int uplo, int n1, int n2, void * alpha, void *A );
+int  TCORE_clatro( int uplo, int trans, int M, int N, const void *A, void *B );
+void TCORE_clauum( int uplo, int N, void *A );
+void TCORE_cplghe( float bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_cplgsy( void * bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_cplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_cpotrf( int uplo, int n, void *A, int *INFO );
+int  TCORE_cssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV );
+void TCORE_csymm( int side, int uplo, int M, int N, void * alpha, const void *A, const void *B, void * beta, void *C );
+void TCORE_csyrk( int uplo, int trans, int N, int K, void * alpha, const void *A, void * beta, void *C );
+void TCORE_csyr2k( int uplo, int trans, int N, int K, void * alpha, const void *A, const void *B, void * beta, void *C );
+int  TCORE_csyssq( int storev, int uplo, int N, const void *A, void *sclssq );
+int  TCORE_csytf2_nopiv( int uplo, int n, void *A );
+int  TCORE_ctplqt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK );
+int  TCORE_ctpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK );
+int  TCORE_ctpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK );
+int  TCORE_ctpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK );
+int  TCORE_ctradd( int uplo, int trans, int M, int N, void * alpha, const void *A, void * beta, void *B );
+void TCORE_ctrasm( int storev, int uplo, int diag, int M, int N, const void *A, float *work );
+void TCORE_ctrmm( int side, int uplo, int transA, int diag, int M, int N, void * alpha, const void *A, void *B );
+void TCORE_ctrsm( int side, int uplo, int transA, int diag, int M, int N, void * alpha, const void *A, void *B );
+int  TCORE_ctrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq );
+void TCORE_ctrtri( int uplo, int diag, int N, void *A, int *info );
+int  TCORE_ctsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork );
+int  TCORE_ctsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork );
+int  TCORE_ctstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, void *WORK, int LDWORK, int *INFO );
+int  TCORE_cunmlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, void *WORK, int LDWORK );
+int  TCORE_cunmqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, void *WORK, int LDWORK );
+int TCORE_cgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A );
+
+void TCORE_dzasum( int storev, int uplo, int M, int N, const void *A, double *work );
+int  TCORE_zaxpy( int M, void *alpha, const void *A, int incA, void *B, int incB );
+int  TCORE_zgeadd( int trans, int M, int N, void *alpha, const void *A, void *beta, void *B );
+int  TCORE_zgelqt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK );
+void TCORE_zgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, void *beta, void *C );
+int  TCORE_zgeqrt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK );
+int  TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A );
+int  TCORE_zgessq( int storev, int M, int N, const void *A, void *sclssq );
+int  TCORE_zgetrf( int M, int N, void *A, int *IPIV, int *INFO );
+int  TCORE_zgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO );
+int  TCORE_zgetrf_nopiv( int M, int N, int IB, void *A, int *INFO );
+void TCORE_zhe2ge( int uplo, int M, int N, const void *A, void *B );
+void TCORE_zhemm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, void *beta, void *C );
+void TCORE_zherk( int uplo, int trans, int N, int K, void *alpha, const void *A, double beta, void *C );
+void TCORE_zher2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C );
+int  TCORE_zherfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, void *WORK, int ldwork );
+int  TCORE_zhessq( int storev, int uplo, int N, const void *A, void *sclssq );
+void TCORE_zlacpy( int uplo, int M, int N, const void *A, void *B );
+void TCORE_zlange( int norm, int M, int N, const void *A, double *work, double *normA );
+void TCORE_zlanhe( int norm, int uplo, int N, const void *A, double *work, double *normA );
+void TCORE_zlansy( int norm, int uplo, int N, const void *A, double *work, double *normA );
+void TCORE_zlantr( int norm, int uplo, int diag, int M, int N, const void *A, double *work, double *normA );
+int  TCORE_zlascal( int uplo, int m, int n, void *alpha, void *A );
+void TCORE_zlaset( int uplo, int n1, int n2, void *alpha, void *beta, void *A );
+void TCORE_zlaset2( int uplo, int n1, int n2, void *alpha, void *A );
+int  TCORE_zlatro( int uplo, int trans, int M, int N, const void *A, void *B );
+void TCORE_zlauum( int uplo, int N, void *A );
+void TCORE_zplghe( double bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_zplgsy( void *bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_zplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed );
+void TCORE_zpotrf( int uplo, int n, void *A, int *INFO );
+int  TCORE_zssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV );
+void TCORE_zsymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, void *beta, void *C );
+void TCORE_zsyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, void *beta, void *C );
+void TCORE_zsyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, void *beta, void *C );
+int  TCORE_zsyssq( int storev, int uplo, int N, const void *A, void *sclssq );
+int  TCORE_zsytf2_nopiv( int uplo, int n, void *A );
+int  TCORE_ztplqt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK );
+int  TCORE_ztpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK );
+int  TCORE_ztpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK );
+int  TCORE_ztpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK );
+int  TCORE_ztradd( int uplo, int trans, int M, int N, void *alpha, const void *A, void *beta, void *B );
+void TCORE_ztrasm( int storev, int uplo, int diag, int M, int N, const void *A, double *work );
+void TCORE_ztrmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+void TCORE_ztrsm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B );
+int  TCORE_ztrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq );
+void TCORE_ztrtri( int uplo, int diag, int N, void *A, int *info );
+int  TCORE_ztsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork );
+int  TCORE_ztsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork );
+int  TCORE_ztstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, void *WORK, int LDWORK, int *INFO );
+int  TCORE_zunmlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, void *WORK, int LDWORK );
+int  TCORE_zunmqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, void *WORK, int LDWORK );
+int  TCORE_zgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A );
+
+void CORE_slacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB);
+void CORE_dlacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB);
+void CORE_clacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB);
+void CORE_zlacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB);
+int CORE_splssq( int storev, int M, int N, void *sclssqin, void *sclssqout );
+int CORE_dplssq( int storev, int M, int N, void *sclssqin, void *sclssqout );
+int CORE_cplssq( int storev, int M, int N, void *sclssqin, void *sclssqout );
+int CORE_zplssq( int storev, int M, int N, void *sclssqin, void *sclssqout );
+int CORE_splssq2( int N, void *sclssq );
+int CORE_dplssq2( int N, void *sclssq );
+int CORE_cplssq2( int N, void *sclssq );
+int CORE_zplssq2( int N, void *sclssq );
+
+END_MODULE
diff --git a/doc/orgmode/chapters/installing.org b/doc/orgmode/chapters/installing.org
index 0193d3dad..7675856d6 100644
--- a/doc/orgmode/chapters/installing.org
+++ b/doc/orgmode/chapters/installing.org
@@ -61,6 +61,9 @@ we encourage users to use [[sec:spack][Spack]].
    sudo apt-get install -y libopenmpi-dev
    # Install hwloc (used by StarPU or QUARK, already a dependency of OpenMPI)
    sudo apt-get install -y libhwloc-dev
+   # install EZTrace, useful to export some nice execution traces
+   # with all runtimes
+   sudo apt-get install -y libeztrace-dev
    # install FxT, usefull to export some nice execution traces with StarPU
    sudo apt-get install -y libfxt-dev
    # Install cuda and cuBLAS: only if you have a GPU cuda compatible
@@ -209,17 +212,17 @@ we encourage users to use [[sec:spack][Spack]].
      *Caution about the compatibility:* Chameleon has been mainly tested
      with the QUARK library coming from https://github.com/ecrc/quark.
 
-**** FxT
-     [[http://download.savannah.gnu.org/releases/fkt/][FxT]] stands for both FKT (Fast Kernel Tracing) and FUT (Fast User
-     Tracing).  This library provides efficient support for recording
-     traces.  Chameleon can trace kernels execution on the different
-     workers and produce .paje files if FxT is enabled.  FxT can only
-     be used through StarPU and StarPU must be compiled with FxT
-     enabled, see how to use this feature here [[sec:trace][Execution trace using
-     StarPU]].
-
-     *Caution about the compatibility:* FxT should be compatible with
-     the version of StarPU used.
+**** EZTrace
+     This library provides efficient modules for recording
+     traces. Chameleon can trace kernels execution on CPU workers
+     thanks to EZTrace and produce .paje files. EZTrace also provides
+     integrated modules to trace MPI calls and/or memory usage. See
+     how to use this feature here [[sec:trace_ezt][Execution trace
+     using EZTrace]]. To trace kernels execution on all kinds of
+     workers, such as CUDA, we recommend using the internal tracing
+     support provided by the underlying runtime system.
+     See how to use this feature here [[sec:trace_fxt][Execution trace
+     using StarPU/FxT]].
 
 **** hwloc
      [[http://www.open-mpi.org/projects/hwloc/][hwloc]] (Portable Hardware Locality) is a software package for
@@ -394,11 +397,6 @@ we encourage users to use [[sec:spack][Spack]].
      * *CHAMELEON_ENABLE_TIMING=ON|OFF* (default ON): to control build
        of timing executables (performances check) contained in timing/
        sub-directory
-     * *CHAMELEON_ENABLE_TRACING=ON|OFF* (default OFF): to enable trace
-       generation during execution of timing drivers. It requires
-       StarPU to be linked with FxT library (trace execution of
-       kernels on workers), see also [[sec:trace][Execution tracing
-       with StarPU]].
      * *CHAMELEON_SIMULATION=ON|OFF* (default OFF): to enable
        simulation mode, means Chameleon will not really execute tasks,
        see details in section [[sec:simu][Use simulation mode with
diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org
index c00695dc2..9798c5831 100644
--- a/doc/orgmode/chapters/using.org
+++ b/doc/orgmode/chapters/using.org
@@ -129,28 +129,91 @@
      * GELS: solves overdetermined or underdetermined linear systems involving a general matrix using the QR or the LQ factorization
      * GESVD: general matrix singular value decomposition
 
-*** Execution trace using StarPU
-    <<sec:trace>>
+*** Execution trace using EZTrace
+    <<sec:trace_ezt>>
+
+    [[http://eztrace.gforge.inria.fr/support.html][EZTrace]] can be used by Chameleon to generate traces. Two modules
+    are automatically generated as soon as EZTrace is detected on the
+    system. The first one (which is recommended) is the
+    ~chameleon_tcore~ module. It traces all the ~TCORE_...()~ functions
+    that are called by the codelets of all the runtimes except PaRSEC. The
+    second one is the ~chameleon_core~ module which traces the lower
+    level ~CORE_...()~ functions. If using PaRSEC, you need to use this
+    module to generate the traces.
+
+    To generate traces with EZTrace, you first need to compile with
+    *-DBUILD_SHARED_LIBS=ON*. EZTrace uses weak symbols to overload
+    function calls with ~LD_PRELOAD~ and enable trace generation. Then,
+    either you install the ~libeztrace-*.so~ files into the EZTrace
+    install directory, or you can add the path of the modules to your
+    environment:
+    #+begin_src
+    export EZTRACE_LIBRARY_PATH=/path/to/your/modules
+    #+end_src
+
+    To check if the modules are available you should have
+    #+begin_src
+    $ eztrace_avail
+    1	omp	Module for OpenMP parallel regions
+    2	pthread	Module for PThread synchronization functions (mutex, semaphore, spinlock, etc.)
+    3	stdio	Module for stdio functions (read, write, select, poll, etc.)
+    4	mpi	Module for MPI functions
+    5	memory	Module for memory functions (malloc, free, etc.)
+    6	papi	Module for PAPI Performance counters
+    128	chameleon_core	Module for Chameleon CORE functions
+    129	chameleon_tcore	Module for Chameleon TCORE functions
+    #+end_src
+
+    Then, you can restrict the modules used during the execution
+    #+begin_src
+    export EZTRACE_TRACE="mpi chameleon_tcore"
+    #+end_src
+
+    _The module ~mpi~ is required if you want to run in distributed mode._
+
+    The setup can be checked with ~eztrace_loaded~
+    #+begin_src
+    $ eztrace_loaded
+    4	mpi	Module for MPI functions
+    129	chameleon_tcore	Module for Chameleon TCORE functions
+    #+end_src
+
+    To generate the traces, you need to run your binary through
+    eztrace:
+    #+begin_src
+    eztrace ./dnew-testing -o gemm -n 1000 -b 200
+    mpirun -np 4 eztrace ./dnew-testing -o gemm -n 1000 -b 200 -P 2
+    #+end_src
+
+    Convert the binary files into a ~.trace~ file, and visualize it.
+    #+begin_src
+    eztrace_convert <username>_eztrace_log_rank_<[0-9]*>
+    vite eztrace_output.trace
+    #+end_src
+
+    For more information on EZTrace, you can follow the [[http://eztrace.gforge.inria.fr/support.html][support page]].
+
+*** Execution trace using StarPU/FxT
+    <<sec:trace_fxt>>
 
     StarPU can generate its own trace log files by compiling it with
     the ~--with-fxt~ option at the configure step (you can have to
     specify the directory where you installed FxT by giving
     ~--with-fxt=...~ instead of ~--with-fxt~ alone).  By doing so, traces
     are generated after each execution of a program which uses StarPU
-    in the directory pointed by the STARPU_FXT_PREFIX environment
+    in the directory pointed by the [[http://starpu.gforge.inria.fr/doc/html/ExecutionConfigurationThroughEnvironmentVariables.html][STARPU_FXT_PREFIX]] environment
     variable.
     #+begin_example
     export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/
     #+end_example
     When executing a ~./timing/...~ Chameleon program, if it has been
-    enabled (StarPU compiled with FxT and
-    *-DCHAMELEON_ENABLE_TRACING=ON*), you can give the option ~--trace~ to
-    tell the program to generate trace log files.
+    enabled (StarPU compiled with FxT), the program will generate
+    trace files in the directory $STARPU_FXT_PREFIX.
 
     Finally, to generate the trace file which can be opened with [[http://vite.gforge.inria.fr/][Vite]]
     program, you can use the *starpu_fxt_tool* executable of StarPU.
-    This tool should be in ~$STARPU_INSTALL_REPOSITORY/bin~.  You can
-    use it to generate the trace file like this:
+    This tool should be in the bin directory of StarPU's installation.
+    You can use it to generate the trace file like this:
     #+begin_src
     path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename
     #+end_src
diff --git a/example/link_chameleon/CMakeLists.txt b/example/link_chameleon/CMakeLists.txt
index 6e0dac9e9..89a16531d 100644
--- a/example/link_chameleon/CMakeLists.txt
+++ b/example/link_chameleon/CMakeLists.txt
@@ -49,7 +49,7 @@ if (CHAMELEON_DISTRIB_DIR)
     if (CHAMELEON_CHAMELEON_USE_QUARK)
         find_package(CHAMELEON COMPONENTS QUARK)
     else()
-        find_package(CHAMELEON COMPONENTS STARPU MPI CUDA FXT)
+        find_package(CHAMELEON COMPONENTS STARPU MPI CUDA)
     endif()
     if (CHAMELEON_FOUND)
         link_directories(${CHAMELEON_LIBRARY_DIRS_DEP})
diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in
index 1596315d8..8432d7a93 100644
--- a/include/chameleon/config.h.in
+++ b/include/chameleon/config.h.in
@@ -47,9 +47,6 @@
 /* Simulation */
 #cmakedefine CHAMELEON_SIMULATION
 
-/* Tracing support */
-#cmakedefine CHAMELEON_ENABLE_TRACING
-
 /* getopt */
 #cmakedefine CHAMELEON_HAVE_GETOPT_H
 #cmakedefine CHAMELEON_HAVE_GETOPT_LONG
diff --git a/new-testing/testing_zauxiliary.c b/new-testing/testing_zauxiliary.c
index da33718fa..16da205d6 100644
--- a/new-testing/testing_zauxiliary.c
+++ b/new-testing/testing_zauxiliary.c
@@ -30,13 +30,16 @@ struct option;
  * @brief Defines all the parameters of the testings
  */
 static parameter_t parameters[] = {
+    /* Name, helper, shname, flags, has_arg, psize, valtype, value, vallist, read, sprint */
     { "id", "Id of the run", 0, PARAM_OUTPUT, 0, 3, TestValInt, {0}, NULL, NULL, sprint_int },
 
     { NULL, "Options", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL },
-    { "help",  "Show this help",                      'h', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
-    { "check", "Enable checking of the result",       'c', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
-    { "human", "Enable human readable mode",          'H', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
-    { "niter", "Perform multiple iteration per test", 'l', PARAM_OPTION, 1, 0, TestValInt, {1}, NULL, pread_int, sprint_int },
+    { "help",     "Show this help",                           'h', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "check",    "Enable checking of the result",            'c', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "human",    "Enable human readable mode",               'H', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "niter",    "Perform multiple iteration per test",      'l', PARAM_OPTION, 1, 0, TestValInt, {1}, NULL, pread_int, sprint_int },
+    { "trace",    "Enable the trace generation",              -30, PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
+    { "nowarmup", "Disable the warmup run to load libraries", -31, PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int },
 
     { NULL, "Machine parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL },
     { "threads", "Number of CPU workers per node",      't', PARAM_OPTION | PARAM_OUTPUT, 1, 7, TestValInt, {1}, NULL, pread_int, sprint_int },
@@ -476,6 +479,7 @@ parameters_destroy()
 int main (int argc, char **argv) {
 
     int ncores, ngpus, human, check, i, niter;
+    int trace, nowarmup;
     int rc, info = 0;
     int run_id = 0;
     char *func_name;
@@ -491,12 +495,14 @@ int main (int argc, char **argv) {
         parameters_read_file( input_file );
         free(input_file);
     }
-    ncores     = parameters_getvalue_int( "threads" );
-    ngpus      = parameters_getvalue_int( "gpus" );
-    check      = parameters_getvalue_int( "check" );
-    human      = parameters_getvalue_int( "human" );
-    func_name  = parameters_getvalue_str( "op" );
-    niter      = parameters_getvalue_int( "niter" );
+    ncores    = parameters_getvalue_int( "threads"  );
+    ngpus     = parameters_getvalue_int( "gpus"     );
+    check     = parameters_getvalue_int( "check"    );
+    human     = parameters_getvalue_int( "human"    );
+    func_name = parameters_getvalue_str( "op"       );
+    niter     = parameters_getvalue_int( "niter"    );
+    trace     = parameters_getvalue_int( "trace"    );
+    nowarmup  = parameters_getvalue_int( "nowarmup" );
 
     CHAMELEON_Init( ncores, ngpus );
 
@@ -510,6 +516,20 @@ int main (int argc, char **argv) {
     /* Executes the tests */
     run_print_header( test, check, human );
     run = runlist->head;
+
+    /* Warmup */
+    if ( !nowarmup ) {
+        run_arg_list_t copy = run_arg_list_copy( &(run->args) );
+        rc = test->fptr( &copy, check );
+        run_arg_list_destroy( &copy );
+    }
+
+    /* Start tracing */
+    if ( trace ) {
+        CHAMELEON_Enable( CHAMELEON_PROFILING_MODE );
+    }
+
+    /* Perform all runs */
     while ( run != NULL ) {
         for(i=0; i<niter; i++) {
             run_arg_list_t copy = run_arg_list_copy( &(run->args) );
@@ -530,6 +550,12 @@ int main (int argc, char **argv) {
         run_list_destroy( run );
         run = next;
     }
+
+    /* Stop tracing */
+    if ( trace ) {
+        CHAMELEON_Disable( CHAMELEON_PROFILING_MODE );
+    }
+
     free( runlist );
 
     CHAMELEON_Finalize();
diff --git a/new-testing/testing_zauxiliary.h b/new-testing/testing_zauxiliary.h
index 2be0d88c5..7b35505f8 100644
--- a/new-testing/testing_zauxiliary.h
+++ b/new-testing/testing_zauxiliary.h
@@ -23,47 +23,6 @@
 
 #include "testings.h"
 
-/**
- *
- * Macro for trace generation
- *
- */
-#define START_TRACING()                         \
-    RUNTIME_start_stats();                      \
-    if(iparam[IPARAM_TRACE] == 2) {             \
-    	RUNTIME_start_profiling();              \
-    }                                           \
-    if(iparam[IPARAM_BOUND]) {                  \
-        CHAMELEON_Enable(CHAMELEON_BOUND);      \
-    }
-
-#define STOP_TRACING()                          \
-    RUNTIME_stop_stats();                       \
-    if(iparam[IPARAM_TRACE] == 2) {             \
-    	RUNTIME_stop_profiling();               \
-    }                                           \
-    if(iparam[IPARAM_BOUND]) {                  \
-        CHAMELEON_Disable(CHAMELEON_BOUND);     \
-    }
-
-/**
- *
- * Macro for DAG generation
- *
- */
-#if 0
-#define START_DAG()                   \
-    if ( iparam[IPARAM_DAG] == 2 )    \
-        CHAMELEON_Enable(CHAMELEON_DAG);
-
-#define STOP_DAG()                    \
-    if ( iparam[IPARAM_DAG] == 2 )    \
-        CHAMELEON_Disable(CHAMELEON_DAG);
-#else
-#define START_DAG()  do {} while(0);
-#define STOP_DAG()   do {} while(0);
-#endif
-
 /**
  *
  * Synchro for distributed computations
@@ -82,23 +41,6 @@
  * General Macros for timing
  *
  */
-/* #define START_TIMING()                          \ */
-/*     START_DAG();                                \ */
-/*     START_TRACING();                            \ */
-/*     START_DISTRIBUTED();                        \ */
-/*     t = -RUNTIME_get_time(); */
-
-/* #define STOP_TIMING()                           \ */
-/*     STOP_DISTRIBUTED();                         \ */
-/*     t += RUNTIME_get_time();                    \ */
-/*     STOP_TRACING();                             \ */
-/*     STOP_DAG();                                 \ */
-/*     if (iparam[IPARAM_PROFILE] == 2) {          \ */
-/*         RUNTIME_kernelprofile_display();        \ */
-/*         RUNTIME_schedprofile_display();         \ */
-/*     }                                           \ */
-/*     *t_ = t; */
-
 #define START_TIMING( _t_ )                     \
     START_DISTRIBUTED();                        \
     (_t_) = RUNTIME_get_time();
diff --git a/new-testing/testing_zgetrf.c b/new-testing/testing_zgetrf.c
index 68491ba58..ebfea504e 100644
--- a/new-testing/testing_zgetrf.c
+++ b/new-testing/testing_zgetrf.c
@@ -55,8 +55,6 @@ testing_zgetrf( run_arg_list_t *args, int check )
     run_arg_add_fixdbl( args, "time", t );
     run_arg_add_fixdbl( args, "gflops", ( hres == CHAMELEON_SUCCESS ) ? gflops : -1. );
 
-    fprintf( stdout, "hres = %d\n", hres );
-
     /* Checks the factorisation and residue */
     if ( check ) {
         CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, NULL );
diff --git a/runtime/openmp/control/runtime_profiling.c b/runtime/openmp/control/runtime_profiling.c
index dfb9e2579..b243c9d4d 100644
--- a/runtime/openmp/control/runtime_profiling.c
+++ b/runtime/openmp/control/runtime_profiling.c
@@ -27,12 +27,10 @@ double RUNTIME_get_time(){
 
 void RUNTIME_start_profiling()
 {
-    chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with OpenMP\n");
 }
 
 void RUNTIME_stop_profiling()
 {
-    chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with OpenMP\n");
 }
 
 void RUNTIME_start_stats()
diff --git a/runtime/parsec/control/runtime_profiling.c b/runtime/parsec/control/runtime_profiling.c
index 870233220..c8f970005 100644
--- a/runtime/parsec/control/runtime_profiling.c
+++ b/runtime/parsec/control/runtime_profiling.c
@@ -26,12 +26,10 @@ double RUNTIME_get_time(){
 
 void RUNTIME_start_profiling()
 {
-    chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with PaRSEC\n");
 }
 
 void RUNTIME_stop_profiling()
 {
-    chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with PaRSEC\n");
 }
 
 void RUNTIME_start_stats()
diff --git a/runtime/quark/control/runtime_profiling.c b/runtime/quark/control/runtime_profiling.c
index 685a81c40..10da952b0 100644
--- a/runtime/quark/control/runtime_profiling.c
+++ b/runtime/quark/control/runtime_profiling.c
@@ -26,12 +26,10 @@ double RUNTIME_get_time(){
 
 void RUNTIME_start_profiling()
 {
-    chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with Quark\n");
 }
 
 void RUNTIME_stop_profiling()
 {
-    chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with Quark\n");
 }
 
 void RUNTIME_start_stats()
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 14659f785..a0f05c045 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -21,7 +21,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "chameleon_starpu.h"
-#if defined(HAVE_STARPU_FXT_PROFILING)
+#if defined(STARPU_USE_FXT)
 #include <starpu_fxt.h>
 #endif
 
@@ -40,7 +40,7 @@ static int chameleon_starpu_init( starpu_conf_t *conf )
         MPI_Initialized( &flag );
 #  endif
 
-#if defined(HAVE_STARPU_FXT_PROFILING)
+#if defined(STARPU_USE_FXT)
         starpu_fxt_autostart_profiling(0);
 #endif
 
diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c
index 78d2394ef..3eb14c216 100644
--- a/runtime/starpu/control/runtime_profiling.c
+++ b/runtime/starpu/control/runtime_profiling.c
@@ -20,7 +20,7 @@
  */
 #include <math.h>
 #include "chameleon_starpu.h"
-#if defined(HAVE_STARPU_FXT_PROFILING)
+#if defined(STARPU_USE_FXT)
 #include <starpu_fxt.h>
 #endif
 
@@ -55,18 +55,14 @@ void RUNTIME_iteration_pop( CHAM_context_t *chamctxt )
 }
 
 void RUNTIME_start_profiling(){
-#if defined(HAVE_STARPU_FXT_PROFILING)
+#if defined(STARPU_USE_FXT)
     starpu_fxt_start_profiling();
-#else
-    fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
 #endif
 }
 
 void RUNTIME_stop_profiling(){
-#if defined(HAVE_STARPU_FXT_PROFILING)
+#if defined(STARPU_USE_FXT)
     starpu_fxt_stop_profiling();
-#else
-    fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n");
 #endif
 }
 
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 6cf49261c..223e5d045 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -24,7 +24,6 @@
 #include "chameleon/config.h"
 
 /* StarPU options */
-#cmakedefine HAVE_STARPU_FXT_PROFILING
 #cmakedefine HAVE_STARPU_IDLE_PREFETCH
 #cmakedefine HAVE_STARPU_ITERATION_PUSH
 #cmakedefine HAVE_STARPU_DATA_WONT_USE
-- 
GitLab