From 250c261e9e502cf179a1c468ba10e46d53404e9d Mon Sep 17 00:00:00 2001 From: PRUVOST Florent <florent.pruvost@inria.fr> Date: Fri, 13 Dec 2019 15:10:42 +0100 Subject: [PATCH] Remove the FXT starpu option and the enable tracaing. It is now done by eztrace, or directly with STARPU_USE_FXT --- CMakeLists.txt | 37 --- compute/CMakeLists.txt | 5 - control/context.c | 2 + coreblas/eztrace_module/CMakeLists.txt | 61 ++--- ...ce_module => coreblas_core_eztrace_module} | 4 +- .../coreblas_tcore_eztrace_module | 224 ++++++++++++++++++ doc/orgmode/chapters/installing.org | 30 ++- doc/orgmode/chapters/using.org | 79 +++++- example/link_chameleon/CMakeLists.txt | 2 +- include/chameleon/config.h.in | 3 - new-testing/testing_zauxiliary.c | 46 +++- new-testing/testing_zauxiliary.h | 58 ----- new-testing/testing_zgetrf.c | 2 - runtime/openmp/control/runtime_profiling.c | 2 - runtime/parsec/control/runtime_profiling.c | 2 - runtime/quark/control/runtime_profiling.c | 2 - runtime/starpu/control/runtime_control.c | 4 +- runtime/starpu/control/runtime_profiling.c | 10 +- runtime/starpu/include/chameleon_starpu.h.in | 1 - 19 files changed, 386 insertions(+), 188 deletions(-) rename coreblas/eztrace_module/{coreblas_eztrace_module => coreblas_core_eztrace_module} (99%) create mode 100644 coreblas/eztrace_module/coreblas_tcore_eztrace_module diff --git a/CMakeLists.txt b/CMakeLists.txt index 33bce25ca..27a355f8b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -184,13 +184,6 @@ if (CHAMELEON_ENABLE_CUDA AND NOT CHAMELEON_USE_CUDA) message("-- ${BoldGreen}CHAMELEON_USE_CUDA is set to OFF, turn it ON to use CUDA (unsupported by Quark)${ColourReset}") endif() -# Enable FXT if StarPU -option(CHAMELEON_ENABLE_TRACING "Enable tracing support" OFF) -if (NOT CHAMELEON_ENABLE_TRACING) - message("-- ${BoldGreen}CHAMELEON_ENABLE_TRACING is set to OFF, turn it ON to use FxT (with StarPU)${ColourReset}") -endif() -#option(CHAMELEON_USE_EZTRACE "Enable EZTRACE to build modules" OFF) - 
option(CHAMELEON_RUNTIME_SYNC "Enable synchronous task submission when available to debug the code without parallelism" OFF) if (CHAMELEON_RUNTIME_SYNC) message("-- ${BoldGreen}CHAMELEON_RUNTIME_SYNC is set to ON, turn it OFF to avoid synchronisation in the tasks submission${ColourReset}") @@ -605,9 +598,6 @@ if( CHAMELEON_SCHED_STARPU ) if(CHAMELEON_USE_MPI) list(APPEND STARPU_COMPONENT_LIST "MPI") endif() - if(CHAMELEON_ENABLE_TRACING) - list(APPEND STARPU_COMPONENT_LIST "FXT") - endif() find_package(STARPU ${CHAMELEON_STARPU_VERSION} REQUIRED COMPONENTS ${STARPU_COMPONENT_LIST}) @@ -673,18 +663,6 @@ if( CHAMELEON_SCHED_STARPU ) set(CHAMELEON_USE_MIGRATE "OFF") message("-- ${Blue}CHAMELEON_USE_MIGRATE is turned OFF because starpu_mpi_data_migrate not found${ColourReset}") endif() - if(CHAMELEON_ENABLE_TRACING) - # check if fxt profiling is accessible in starpu and activate it in chameleon - check_function_exists(starpu_fxt_start_profiling HAVE_STARPU_FXT_PROFILING) - if ( HAVE_STARPU_FXT_PROFILING ) - message("-- ${Blue}Add definition HAVE_STARPU_FXT_PROFILING" - " - Activate FxT profiling through StarPU${ColourReset}") - else() - message("-- ${Red}Looking for starpu with fxt" - " - starpu_fxt_start_profiling() test fails in StarPU${ColourReset}") - message("-- ${Red}Check in CMakeFiles/CMakeError.log to figure out why it fails${ColourReset}") - endif() - endif() if (CHAMELEON_USE_MPI) # Check if a specific function exist check_function_exists(starpu_mpi_data_register_comm HAVE_STARPU_MPI_DATA_REGISTER) @@ -716,14 +694,6 @@ if( CHAMELEON_SCHED_STARPU ) Print_Find_Library_Status(hwloc libhwloc) endif () endif() - if(CHAMELEON_ENABLE_TRACING AND (NOT FXT_FOUND OR NOT FXT_LIBRARIES)) - if (NOT FXT_fxt.h_DIRS) - Print_Find_Header_Status(fxt fxt.h) - endif () - if (NOT FXT_fxt_LIBRARY) - Print_Find_Library_Status(fxt libfxt) - endif () - endif() if(CHAMELEON_SIMULATION AND (NOT SIMGRID_FOUND OR NOT SIMGRID_LIBRARIES)) if (NOT SIMGRID_simgrid.h_DIRS) 
Print_Find_Header_Status(simgrid simgrid.h) @@ -758,9 +728,6 @@ if( CHAMELEON_SCHED_STARPU ) if(CHAMELEON_SIMULATION AND (NOT SIMGRID_FOUND OR NOT SIMGRID_LIBRARIES)) message(FATAL_ERROR "SimGrid library is required but has not been found") endif() - if(CHAMELEON_ENABLE_TRACING AND (NOT FXT_FOUND OR NOT FXT_LIBRARIES)) - message(FATAL_ERROR "FxT library is required but has not been found") - endif() if( (NOT STARPU_SHM_FOUND) OR (NOT STARPU_SHM_LIBRARIES) OR ( STARPU_LOOK_FOR_MPI AND (NOT STARPU_MPI_FOUND OR NOT STARPU_MPI_LIBRARIES) ) ) @@ -780,10 +747,6 @@ if( CHAMELEON_SCHED_PARSEC ) if(CHAMELEON_USE_CUDA) list(APPEND PARSEC_COMPONENT_LIST "CUDA") endif() - # TODO: Add a CHAMELEON_WITH_PROFILING option that enables Fxt for StarPU, or PAPI for PaRSEC - #if(CHAMELEON_WITH_PROFILING) - # list(APPEND PARSEC_COMPONENT_LIST "PAPI") - #endif() find_package(PARSEC COMPONENTS ${PARSEC_COMPONENT_LIST}) if(PARSEC_FOUND) message("-- ${Blue}Add definition CHAMELEON_SCHED_PARSEC" diff --git a/compute/CMakeLists.txt b/compute/CMakeLists.txt index 3ca2aaaa1..b49870887 100644 --- a/compute/CMakeLists.txt +++ b/compute/CMakeLists.txt @@ -304,11 +304,6 @@ elseif(CHAMELEON_SCHED_QUARK) elseif(CHAMELEON_SCHED_OPENMP) target_link_libraries(chameleon chameleon_openmp) endif() -if (NOT CHAMELEON_SIMULATION) - # Depends on coreblas only for set_coreblas_gemm3m_enabled() (Maybe we should change that) - add_dependencies(chameleon coreblas_include) - target_link_libraries(chameleon coreblas) -endif() target_link_libraries(chameleon hqr) add_dependencies(chameleon diff --git a/control/context.c b/control/context.c index 4e0a6672b..759716782 100644 --- a/control/context.c +++ b/control/context.c @@ -148,6 +148,7 @@ int CHAMELEON_Enable(int option) break; case CHAMELEON_PROFILING_MODE: chamctxt->profiling_enabled = CHAMELEON_TRUE; + RUNTIME_start_profiling(); break; case CHAMELEON_PROGRESS: chamctxt->progress_enabled = CHAMELEON_TRUE; @@ -218,6 +219,7 @@ int CHAMELEON_Disable(int option) 
break; case CHAMELEON_PROFILING_MODE: chamctxt->profiling_enabled = CHAMELEON_FALSE; + RUNTIME_stop_profiling(); break; case CHAMELEON_PROGRESS: chamctxt->progress_enabled = CHAMELEON_FALSE; diff --git a/coreblas/eztrace_module/CMakeLists.txt b/coreblas/eztrace_module/CMakeLists.txt index 810bab667..2850ca003 100644 --- a/coreblas/eztrace_module/CMakeLists.txt +++ b/coreblas/eztrace_module/CMakeLists.txt @@ -39,36 +39,39 @@ if (EZTRACE_FOUND AND EZTRACE_DIR_FOUND) set(EZTRACE_CREATE_PLUGIN "${EZTRACE_eztrace_create_plugin_DIR}/eztrace_create_plugin") - add_custom_command( - OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/output - COMMAND ${EZTRACE_CREATE_PLUGIN} - ARGS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_eztrace_module - DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_eztrace_module - ) - add_custom_target( - eztrace-module-chameleon_core-dir ALL - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output - ) - add_custom_command( - OUTPUT libeztrace-convert-chameleon_core.so - COMMAND make - WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/output - DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/output - ) - add_custom_target( - eztrace-module-chameleon_core-libs ALL - DEPENDS libeztrace-convert-chameleon_core.so - ) - # installation - # ------------ - install( - FILES - ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-autostart-chameleon_core.so - ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-chameleon_core.so - ${CMAKE_CURRENT_BINARY_DIR}/output/libeztrace-convert-chameleon_core.so - DESTINATION ${EZTRACE_LIBRARY_DIRS} - ) + set( COREBLAS_EZTRACE_MODULES core tcore ) + foreach( _module ${COREBLAS_EZTRACE_MODULES} ) + add_custom_command( + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/${_module} + COMMAND ${EZTRACE_CREATE_PLUGIN} -o ${CMAKE_CURRENT_BINARY_DIR}/${_module} > /dev/null + ARGS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_${_module}_eztrace_module + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/coreblas_${_module}_eztrace_module + ) + add_custom_target( + eztrace-module-chameleon_${_module}-dir ALL + DEPENDS 
${CMAKE_CURRENT_BINARY_DIR}/${_module} + ) + add_custom_command( + OUTPUT libeztrace-convert-chameleon_${_module}.so + COMMAND make + WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/${_module} + DEPENDS ${CMAKE_CURRENT_BINARY_DIR}/${_module} + ) + add_custom_target( + eztrace-module-chameleon_${_module}-libs ALL + DEPENDS libeztrace-convert-chameleon_${_module}.so + ) + # installation + # ------------ + install( + FILES + ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-autostart-chameleon_${_module}.so + ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-chameleon_${_module}.so + ${CMAKE_CURRENT_BINARY_DIR}/${_module}/libeztrace-convert-chameleon_${_module}.so + DESTINATION ${EZTRACE_LIBRARY_DIRS} + ) + endforeach() endif (EZTRACE_eztrace_create_plugin_DIR) endif (EZTRACE_FOUND AND EZTRACE_DIR_FOUND) diff --git a/coreblas/eztrace_module/coreblas_eztrace_module b/coreblas/eztrace_module/coreblas_core_eztrace_module similarity index 99% rename from coreblas/eztrace_module/coreblas_eztrace_module rename to coreblas/eztrace_module/coreblas_core_eztrace_module index c93f7a3a4..a9e517548 100644 --- a/coreblas/eztrace_module/coreblas_eztrace_module +++ b/coreblas/eztrace_module/coreblas_core_eztrace_module @@ -1,9 +1,7 @@ BEGIN_MODULE NAME chameleon_core DESC "Module for Chameleon CORE functions" -ID 7770 - -#include <chameleon.h> +ID 0 void CORE_scasum(int storev, int uplo, int M, int N, void *A, int lda, float *work); diff --git a/coreblas/eztrace_module/coreblas_tcore_eztrace_module b/coreblas/eztrace_module/coreblas_tcore_eztrace_module new file mode 100644 index 000000000..18698b8a1 --- /dev/null +++ b/coreblas/eztrace_module/coreblas_tcore_eztrace_module @@ -0,0 +1,224 @@ +BEGIN_MODULE +NAME chameleon_tcore +DESC "Module for Chameleon TCORE functions" +ID 1 + + +void TCORE_sasum( int storev, int uplo, int M, int N, const void *A, float *work ); +int TCORE_saxpy( int M, void *alpha, const void *A, int incA, void *B, int incB ); +int TCORE_sgeadd( int trans, int M, int 
N, void *alpha, const void *A, float beta, void *B ); +int TCORE_sgelqt( int M, int N, int IB, void *A, void *T, float *TAU, float *WORK ); +void TCORE_sgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, float beta, void *C ); +int TCORE_sgeqrt( int M, int N, int IB, void *A, void *T, float *TAU, float *WORK ); +int TCORE_sgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A ); +int TCORE_sgessq( int storev, int M, int N, const void *A, void *sclssq ); +int TCORE_sgetrf( int M, int N, void *A, int *IPIV, int *INFO ); +int TCORE_sgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO ); +int TCORE_sgetrf_nopiv( int M, int N, int IB, void *A, int *INFO ); +void TCORE_she2ge( int uplo, int M, int N, const void *A, void *B ); +int TCORE_ssyrfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, float *WORK, int ldwork ); +void TCORE_slacpy( int uplo, int M, int N, const void *A, void *B ); +void TCORE_slange( int norm, int M, int N, const void *A, float *work, float *normA ); +void TCORE_slansy( int norm, int uplo, int N, const void *A, float *work, float *normA ); +void TCORE_slantr( int norm, int uplo, int diag, int M, int N, const void *A, float *work, float *normA ); +int TCORE_slascal( int uplo, int m, int n, void *alpha, void *A ); +void TCORE_slaset( int uplo, int n1, int n2, void *alpha, float beta, void *A ); +void TCORE_slaset2( int uplo, int n1, int n2, void *alpha, void *A ); +int TCORE_slatro( int uplo, int trans, int M, int N, const void *A, void *B ); +void TCORE_slauum( int uplo, int N, void *A ); +void TCORE_splgsy( float bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_splrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_spotrf( int uplo, int n, void *A, int *INFO ); +int TCORE_sssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void 
*A2, const void *L1, const void *L2, const int *IPIV ); +void TCORE_ssymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, float beta, void *C ); +void TCORE_ssyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, float beta, void *C ); +void TCORE_ssyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, float beta, void *C ); +int TCORE_ssyssq( int storev, int uplo, int N, const void *A, void *sclssq ); +int TCORE_ssytf2_nopiv( int uplo, int n, void *A ); +int TCORE_stplqt( int M, int N, int L, int IB, void *A, void *B, void *T, float *WORK ); +int TCORE_stpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, float *WORK ); +int TCORE_stpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, float *WORK ); +int TCORE_stpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, float *WORK ); +int TCORE_stradd( int uplo, int trans, int M, int N, void *alpha, const void *A, float beta, void *B ); +void TCORE_strasm( int storev, int uplo, int diag, int M, int N, const void *A, float *work ); +void TCORE_strmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +void TCORE_strsm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +int TCORE_strssq( int uplo, int diag, int M, int N, const void *A, void *sclssq ); +void TCORE_strtri( int uplo, int diag, int N, void *A, int *info ); +int TCORE_stsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, float *WORK, int ldwork ); +int TCORE_stsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, float *WORK, int ldwork ); +int TCORE_ststrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int 
*IPIV, float *WORK, int LDWORK, int *INFO ); +int TCORE_sormlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, float *WORK, int LDWORK ); +int TCORE_sormqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, float *WORK, int LDWORK ); +int TCORE_sgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A ); + +void TCORE_dasum( int storev, int uplo, int M, int N, const void *A, double *work ); +int TCORE_daxpy( int M, void *alpha, const void *A, int incA, void *B, int incB ); +int TCORE_dgeadd( int trans, int M, int N, void *alpha, const void *A, double beta, void *B ); +int TCORE_dgelqt( int M, int N, int IB, void *A, void *T, double *TAU, double *WORK ); +void TCORE_dgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C ); +int TCORE_dgeqrt( int M, int N, int IB, void *A, void *T, double *TAU, double *WORK ); +int TCORE_dgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A ); +int TCORE_dgessq( int storev, int M, int N, const void *A, void *sclssq ); +int TCORE_dgetrf( int M, int N, void *A, int *IPIV, int *INFO ); +int TCORE_dgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO ); +int TCORE_dgetrf_nopiv( int M, int N, int IB, void *A, int *INFO ); +void TCORE_dhe2ge( int uplo, int M, int N, const void *A, void *B ); +int TCORE_dsyrfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, double *WORK, int ldwork ); +void TCORE_dlacpy( int uplo, int M, int N, const void *A, void *B ); +void TCORE_dlange( int norm, int M, int N, const void *A, double *work, double *normA ); +void TCORE_dlansy( int norm, int uplo, int N, const void *A, double *work, double *normA ); +void TCORE_dlantr( int norm, int uplo, int diag, int M, int N, const void *A, double *work, double *normA ); +int TCORE_dlascal( int uplo, int m, int n, void 
*alpha, void *A ); +void TCORE_dlaset( int uplo, int n1, int n2, void *alpha, double beta, void *A ); +void TCORE_dlaset2( int uplo, int n1, int n2, void *alpha, void *A ); +int TCORE_dlatro( int uplo, int trans, int M, int N, const void *A, void *B ); +void TCORE_dlauum( int uplo, int N, void *A ); +void TCORE_dplgsy( double bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_dplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_dpotrf( int uplo, int n, void *A, int *INFO ); +int TCORE_dssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV ); +void TCORE_dsymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, double beta, void *C ); +void TCORE_dsyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, double beta, void *C ); +void TCORE_dsyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C ); +int TCORE_dsyssq( int storev, int uplo, int N, const void *A, void *sclssq ); +int TCORE_dsytf2_nopiv( int uplo, int n, void *A ); +int TCORE_dtplqt( int M, int N, int L, int IB, void *A, void *B, void *T, double *WORK ); +int TCORE_dtpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, double *WORK ); +int TCORE_dtpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, double *WORK ); +int TCORE_dtpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, double *WORK ); +int TCORE_dtradd( int uplo, int trans, int M, int N, void *alpha, const void *A, double beta, void *B ); +void TCORE_dtrasm( int storev, int uplo, int diag, int M, int N, const void *A, double *work ); +void TCORE_dtrmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +void TCORE_dtrsm( 
int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +int TCORE_dtrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq ); +void TCORE_dtrtri( int uplo, int diag, int N, void *A, int *info ); +int TCORE_dtsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, double *WORK, int ldwork ); +int TCORE_dtsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, double *WORK, int ldwork ); +int TCORE_dtstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, double *WORK, int LDWORK, int *INFO ); +int TCORE_dormlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, double *WORK, int LDWORK ); +int TCORE_dormqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, double *WORK, int LDWORK ); +int TCORE_dgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A ); + +void TCORE_scasum( int storev, int uplo, int M, int N, const void *A, float *work ); +int TCORE_caxpy( int M, void * alpha, const void *A, int incA, void *B, int incB ); +int TCORE_cgeadd( int trans, int M, int N, void * alpha, const void *A, void * beta, void *B ); +int TCORE_cgelqt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK ); +void TCORE_cgemm( int transA, int transB, int M, int N, int K, void * alpha, const void *A, const void *B, void * beta, void *C ); +int TCORE_cgeqrt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK ); +int TCORE_cgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A ); +int TCORE_cgessq( int storev, int M, int N, const void *A, void *sclssq ); +int TCORE_cgetrf( int M, int N, void *A, int *IPIV, int *INFO ); +int TCORE_cgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO ); +int 
TCORE_cgetrf_nopiv( int M, int N, int IB, void *A, int *INFO ); +void TCORE_che2ge( int uplo, int M, int N, const void *A, void *B ); +void TCORE_chemm( int side, int uplo, int M, int N, void * alpha, const void *A, const void *B, void * beta, void *C ); +void TCORE_cherk( int uplo, int trans, int N, int K, void *alpha, const void *A, float beta, void *C ); +void TCORE_cher2k( int uplo, int trans, int N, int K, void * alpha, const void *A, const void *B, float beta, void *C ); +int TCORE_cherfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, void *WORK, int ldwork ); +int TCORE_chessq( int storev, int uplo, int N, const void *A, void *sclssq ); +void TCORE_clacpy( int uplo, int M, int N, const void *A, void *B ); +void TCORE_clange( int norm, int M, int N, const void *A, float *work, float *normA ); +void TCORE_clanhe( int norm, int uplo, int N, const void *A, float *work, float *normA ); +void TCORE_clansy( int norm, int uplo, int N, const void *A, float *work, float *normA ); +void TCORE_clantr( int norm, int uplo, int diag, int M, int N, const void *A, float *work, float *normA ); +int TCORE_clascal( int uplo, int m, int n, void * alpha, void *A ); +void TCORE_claset( int uplo, int n1, int n2, void * alpha, void * beta, void *A ); +void TCORE_claset2( int uplo, int n1, int n2, void * alpha, void *A ); +int TCORE_clatro( int uplo, int trans, int M, int N, const void *A, void *B ); +void TCORE_clauum( int uplo, int N, void *A ); +void TCORE_cplghe( float bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_cplgsy( void * bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_cplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_cpotrf( int uplo, int n, void *A, int *INFO ); +int TCORE_cssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, 
const int *IPIV ); +void TCORE_csymm( int side, int uplo, int M, int N, void * alpha, const void *A, const void *B, void * beta, void *C ); +void TCORE_csyrk( int uplo, int trans, int N, int K, void * alpha, const void *A, void * beta, void *C ); +void TCORE_csyr2k( int uplo, int trans, int N, int K, void * alpha, const void *A, const void *B, void * beta, void *C ); +int TCORE_csyssq( int storev, int uplo, int N, const void *A, void *sclssq ); +int TCORE_csytf2_nopiv( int uplo, int n, void *A ); +int TCORE_ctplqt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK ); +int TCORE_ctpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK ); +int TCORE_ctpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK ); +int TCORE_ctpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK ); +int TCORE_ctradd( int uplo, int trans, int M, int N, void * alpha, const void *A, void * beta, void *B ); +void TCORE_ctrasm( int storev, int uplo, int diag, int M, int N, const void *A, float *work ); +void TCORE_ctrmm( int side, int uplo, int transA, int diag, int M, int N, void * alpha, const void *A, void *B ); +void TCORE_ctrsm( int side, int uplo, int transA, int diag, int M, int N, void * alpha, const void *A, void *B ); +int TCORE_ctrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq ); +void TCORE_ctrtri( int uplo, int diag, int N, void *A, int *info ); +int TCORE_ctsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork ); +int TCORE_ctsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork ); +int TCORE_ctstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, void *WORK, int LDWORK, int 
*INFO ); +int TCORE_cunmlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, void *WORK, int LDWORK ); +int TCORE_cunmqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, void *WORK, int LDWORK ); +int TCORE_cgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A ); + +void TCORE_dzasum( int storev, int uplo, int M, int N, const void *A, double *work ); +int TCORE_zaxpy( int M, void *alpha, const void *A, int incA, void *B, int incB ); +int TCORE_zgeadd( int trans, int M, int N, void *alpha, const void *A, void *beta, void *B ); +int TCORE_zgelqt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK ); +void TCORE_zgemm( int transA, int transB, int M, int N, int K, void *alpha, const void *A, const void *B, void *beta, void *C ); +int TCORE_zgeqrt( int M, int N, int IB, void *A, void *T, void *TAU, void *WORK ); +int TCORE_zgessm( int M, int N, int K, int IB, const int *IPIV, const void *L, void *A ); +int TCORE_zgessq( int storev, int M, int N, const void *A, void *sclssq ); +int TCORE_zgetrf( int M, int N, void *A, int *IPIV, int *INFO ); +int TCORE_zgetrf_incpiv( int M, int N, int IB, void *A, int *IPIV, int *INFO ); +int TCORE_zgetrf_nopiv( int M, int N, int IB, void *A, int *INFO ); +void TCORE_zhe2ge( int uplo, int M, int N, const void *A, void *B ); +void TCORE_zhemm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, void *beta, void *C ); +void TCORE_zherk( int uplo, int trans, int N, int K, void *alpha, const void *A, double beta, void *C ); +void TCORE_zher2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, double beta, void *C ); +int TCORE_zherfb( int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, void *WORK, int ldwork ); +int TCORE_zhessq( int storev, int uplo, int N, const void *A, void *sclssq ); +void TCORE_zlacpy( int uplo, int M, 
int N, const void *A, void *B ); +void TCORE_zlange( int norm, int M, int N, const void *A, double *work, double *normA ); +void TCORE_zlanhe( int norm, int uplo, int N, const void *A, double *work, double *normA ); +void TCORE_zlansy( int norm, int uplo, int N, const void *A, double *work, double *normA ); +void TCORE_zlantr( int norm, int uplo, int diag, int M, int N, const void *A, double *work, double *normA ); +int TCORE_zlascal( int uplo, int m, int n, void *alpha, void *A ); +void TCORE_zlaset( int uplo, int n1, int n2, void *alpha, void *beta, void *A ); +void TCORE_zlaset2( int uplo, int n1, int n2, void *alpha, void *A ); +int TCORE_zlatro( int uplo, int trans, int M, int N, const void *A, void *B ); +void TCORE_zlauum( int uplo, int N, void *A ); +void TCORE_zplghe( double bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_zplgsy( void *bump, int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_zplrnt( int m, int n, void *tileA, int bigM, int m0, int n0, unsigned long long int seed ); +void TCORE_zpotrf( int uplo, int n, void *A, int *INFO ); +int TCORE_zssssm( int M1, int N1, int M2, int N2, int K, int IB, void *A1, void *A2, const void *L1, const void *L2, const int *IPIV ); +void TCORE_zsymm( int side, int uplo, int M, int N, void *alpha, const void *A, const void *B, void *beta, void *C ); +void TCORE_zsyrk( int uplo, int trans, int N, int K, void *alpha, const void *A, void *beta, void *C ); +void TCORE_zsyr2k( int uplo, int trans, int N, int K, void *alpha, const void *A, const void *B, void *beta, void *C ); +int TCORE_zsyssq( int storev, int uplo, int N, const void *A, void *sclssq ); +int TCORE_zsytf2_nopiv( int uplo, int n, void *A ); +int TCORE_ztplqt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK ); +int TCORE_ztpmlqt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void 
*WORK ); +int TCORE_ztpmqrt( int side, int trans, int M, int N, int K, int L, int IB, const void *V, const void *T, void *A, void *B, void *WORK ); +int TCORE_ztpqrt( int M, int N, int L, int IB, void *A, void *B, void *T, void *WORK ); +int TCORE_ztradd( int uplo, int trans, int M, int N, void *alpha, const void *A, void *beta, void *B ); +void TCORE_ztrasm( int storev, int uplo, int diag, int M, int N, const void *A, double *work ); +void TCORE_ztrmm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +void TCORE_ztrsm( int side, int uplo, int transA, int diag, int M, int N, void *alpha, const void *A, void *B ); +int TCORE_ztrssq( int uplo, int diag, int M, int N, const void *A, void *sclssq ); +void TCORE_ztrtri( int uplo, int diag, int N, void *A, int *info ); +int TCORE_ztsmlq_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork ); +int TCORE_ztsmqr_hetra1( int side, int trans, int m1, int n1, int m2, int n2, int k, int ib, void *A1, void *A2, const void *V, const void *T, void *WORK, int ldwork ); +int TCORE_ztstrf( int M, int N, int IB, int NB, void *U, void *A, void *L, int *IPIV, void *WORK, int LDWORK, int *INFO ); +int TCORE_zunmlq( int side, int trans, int M, int N, int IB, int K, const void *V, const void *T, void *C, void *WORK, int LDWORK ); +int TCORE_zunmqr( int side, int trans, int M, int N, int K, int IB, const void *V, const void *T, void *C, void *WORK, int LDWORK ); +int TCORE_zgram( int uplo, int M, int N, int Mt, int Nt, const void *Di, const void *Dj, const void *D, void *A ); + +void CORE_slacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB); +void CORE_dlacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB); +void CORE_clacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB); +void CORE_zlacpy(int uplo, int M, int N, void *A, int LDA, void *B, int LDB); +int 
CORE_splssq( int storev, int M, int N, void *sclssqin, void *sclssqout ); +int CORE_dplssq( int storev, int M, int N, void *sclssqin, void *sclssqout ); +int CORE_cplssq( int storev, int M, int N, void *sclssqin, void *sclssqout ); +int CORE_zplssq( int storev, int M, int N, void *sclssqin, void *sclssqout ); +int CORE_splssq2( int N, void *sclssq ); +int CORE_dplssq2( int N, void *sclssq ); +int CORE_cplssq2( int N, void *sclssq ); +int CORE_zplssq2( int N, void *sclssq ); + +END_MODULE diff --git a/doc/orgmode/chapters/installing.org b/doc/orgmode/chapters/installing.org index 0193d3dad..7675856d6 100644 --- a/doc/orgmode/chapters/installing.org +++ b/doc/orgmode/chapters/installing.org @@ -61,6 +61,9 @@ we encourage users to use [[sec:spack][Spack]]. sudo apt-get install -y libopenmpi-dev # Install hwloc (used by StarPU or QUARK, already a dependency of OpenMPI) sudo apt-get install -y libhwloc-dev + # install EZTrace, useful to export some nice execution traces + # with all runtimes + sudo apt-get install -y libeztrace-dev # install FxT, usefull to export some nice execution traces with StarPU sudo apt-get install -y libfxt-dev # Install cuda and cuBLAS: only if you have a GPU cuda compatible @@ -209,17 +212,17 @@ we encourage users to use [[sec:spack][Spack]]. *Caution about the compatibility:* Chameleon has been mainly tested with the QUARK library coming from https://github.com/ecrc/quark. -**** FxT - [[http://download.savannah.gnu.org/releases/fkt/][FxT]] stands for both FKT (Fast Kernel Tracing) and FUT (Fast User - Tracing). This library provides efficient support for recording - traces. Chameleon can trace kernels execution on the different - workers and produce .paje files if FxT is enabled. FxT can only - be used through StarPU and StarPU must be compiled with FxT - enabled, see how to use this feature here [[sec:trace][Execution trace using - StarPU]]. - - *Caution about the compatibility:* FxT should be compatible with - the version of StarPU used. 
+**** EZTrace + This library provides efficient modules for recording + traces. Chameleon can trace kernels execution on CPU workers + thanks to EZTrace and produce .paje files. EZTrace also provides + integrated modules to trace MPI calls and/or memory usage. See + how to use this feature here [[sec:trace_ezt][Execution trace + using EZTrace]]. To trace kernels execution on all kinds of + workers, such as CUDA, we recommend using the internal tracing + support provided by the underlying runtime system. See how to + use this feature here [[sec:trace_fxt][Execution trace + using StarPU/FxT]]. **** hwloc [[http://www.open-mpi.org/projects/hwloc/][hwloc]] (Portable Hardware Locality) is a software package for @@ -394,11 +397,6 @@ we encourage users to use [[sec:spack][Spack]]. * *CHAMELEON_ENABLE_TIMING=ON|OFF* (default ON): to control build of timing executables (performances check) contained in timing/ sub-directory - * *CHAMELEON_ENABLE_TRACING=ON|OFF* (default OFF): to enable trace - generation during execution of timing drivers. It requires - StarPU to be linked with FxT library (trace execution of - kernels on workers), see also [[sec:trace][Execution tracing - with StarPU]]. * *CHAMELEON_SIMULATION=ON|OFF* (default OFF): to enable simulation mode, means Chameleon will not really execute tasks, see details in section [[sec:simu][Use simulation mode with diff --git a/doc/orgmode/chapters/using.org b/doc/orgmode/chapters/using.org index c00695dc2..9798c5831 100644 --- a/doc/orgmode/chapters/using.org +++ b/doc/orgmode/chapters/using.org @@ -129,28 +129,91 @@ * GELS: solves overdetermined or underdetermined linear systems involving a general matrix using the QR or the LQ factorization * GESVD: general matrix singular value decomposition -*** Execution trace using StarPU - <<sec:trace>> +*** Execution trace using EZTrace + <<sec:trace_ezt>> + + [[http://eztrace.gforge.inria.fr/support.html][EZTrace]] can be used by chameleon to generate traces. 
Two modules + are automatically generated as soon as EZTrace is detected on the + system. The first one (which is recommended) is the + ~chameleon_tcore~ module. It traces all the ~TCORE_...()~ functions + that are called by the codelets of all the runtimes except PaRSEC. The + second one is the ~chameleon_core~ module which traces the lower + level ~CORE_...()~ functions. If using PaRSEC, you need to use this + module to generate the traces. + + To generate traces with EZTrace, you need first to compile with + *-DBUILD_SHARED_LIBS=ON*. EZTrace is using weak symbols to overload + function calls with ld_preload and enable trace generation. Then, + either you install the ~libeztrace-*.so~ files into the EZTrace + install directory, or you can add the path of the modules to your + environment + #+begin_src + export EZTRACE_LIBRARY_PATH=/path/to/your/modules + #+end_src + + To check if the modules are available you should have + #+begin_src + $ eztrace_avail + 1 omp Module for OpenMP parallel regions + 2 pthread Module for PThread synchronization functions (mutex, semaphore, spinlock, etc.) + 3 stdio Module for stdio functions (read, write, select, poll, etc.) + 4 mpi Module for MPI functions + 5 memory Module for memory functions (malloc, free, etc.)
+ 6 papi Module for PAPI Performance counters + 128 chameleon_core Module for Chameleon CORE functions + 129 chameleon_tcore Module for Chameleon TCORE functions + #+end_src + + Then, you can restrict the modules used during the execution + #+begin_src + export EZTRACE_TRACE="mpi chameleon_tcore" + #+end_src + + _The module ~mpi~ is required if you want to run in distributed._ + + The setup can be checked with ~eztrace_loaded~ + #+begin_src + $ eztrace_loaded + 4 mpi Module for MPI functions + 129 chameleon_tcore Module for Chameleon TCORE functions + #+end_src + + To generate the traces, you need to run your binary through + eztrace: + #+begin_src + eztrace ./dnew-testing -o gemm -n 1000 -b 200 + mpirun -np 4 eztrace ./dnew-testing -o gemm -n 1000 -b 200 -P 2 + #+end_src + + Convert the binary files into a ~.trace~ file, and visualize it. + #+begin_src + eztrace_convert <username>_eztrace_log_rank_<[0-9]*> + vite eztrace_output.trace + #+end_src + + For more information on EZTrace, you can follow the [[http://eztrace.gforge.inria.fr/support.html][support page]]. + +*** Execution trace using StarPU/FxT + <<sec:trace_fxt>> StarPU can generate its own trace log files by compiling it with the ~--with-fxt~ option at the configure step (you can have to specify the directory where you installed FxT by giving ~--with-fxt=...~ instead of ~--with-fxt~ alone). By doing so, traces are generated after each execution of a program which uses StarPU - in the directory pointed by the STARPU_FXT_PREFIX environment + in the directory pointed by the [[http://starpu.gforge.inria.fr/doc/html/ExecutionConfigurationThroughEnvironmentVariables.html][STARPU_FXT_PREFIX]] environment variable. #+begin_example export STARPU_FXT_PREFIX=/home/jdoe/fxt_files/ #+end_example When executing a ~./timing/...~ Chameleon program, if it has been - enabled (StarPU compiled with FxT and - *-DCHAMELEON_ENABLE_TRACING=ON*), you can give the option ~--trace~ to - tell the program to generate trace log files. 
+ enabled (StarPU compiled with FxT), the program will generate + trace files in the directory $STARPU_FXT_PREFIX. Finally, to generate the trace file which can be opened with [[http://vite.gforge.inria.fr/][Vite]] program, you can use the *starpu_fxt_tool* executable of StarPU. - This tool should be in ~$STARPU_INSTALL_REPOSITORY/bin~. You can - use it to generate the trace file like this: + This tool should be in the bin directory of StarPU's installation. + You can use it to generate the trace file like this: #+begin_src path/to/your/install/starpu/bin/starpu_fxt_tool -i prof_filename #+end_src diff --git a/example/link_chameleon/CMakeLists.txt b/example/link_chameleon/CMakeLists.txt index 6e0dac9e9..89a16531d 100644 --- a/example/link_chameleon/CMakeLists.txt +++ b/example/link_chameleon/CMakeLists.txt @@ -49,7 +49,7 @@ if (CHAMELEON_DISTRIB_DIR) if (CHAMELEON_CHAMELEON_USE_QUARK) find_package(CHAMELEON COMPONENTS QUARK) else() - find_package(CHAMELEON COMPONENTS STARPU MPI CUDA FXT) + find_package(CHAMELEON COMPONENTS STARPU MPI CUDA) endif() if (CHAMELEON_FOUND) link_directories(${CHAMELEON_LIBRARY_DIRS_DEP}) diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in index 1596315d8..8432d7a93 100644 --- a/include/chameleon/config.h.in +++ b/include/chameleon/config.h.in @@ -47,9 +47,6 @@ /* Simulation */ #cmakedefine CHAMELEON_SIMULATION -/* Tracing support */ -#cmakedefine CHAMELEON_ENABLE_TRACING - /* getopt */ #cmakedefine CHAMELEON_HAVE_GETOPT_H #cmakedefine CHAMELEON_HAVE_GETOPT_LONG diff --git a/new-testing/testing_zauxiliary.c b/new-testing/testing_zauxiliary.c index da33718fa..16da205d6 100644 --- a/new-testing/testing_zauxiliary.c +++ b/new-testing/testing_zauxiliary.c @@ -30,13 +30,16 @@ struct option; * @brief Defines all the parameters of the testings */ static parameter_t parameters[] = { + /* Name, helper, shname, flags, has_arg, psize, valtype, value, vallist, read, sprint */ { "id", "Id of the run", 0, PARAM_OUTPUT, 0, 3, 
TestValInt, {0}, NULL, NULL, sprint_int }, { NULL, "Options", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, - { "help", "Show this help", 'h', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, - { "check", "Enable checking of the result", 'c', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, - { "human", "Enable human readable mode", 'H', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, - { "niter", "Perform multiple iteration per test", 'l', PARAM_OPTION, 1, 0, TestValInt, {1}, NULL, pread_int, sprint_int }, + { "help", "Show this help", 'h', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "check", "Enable checking of the result", 'c', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "human", "Enable human readable mode", 'H', PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "niter", "Perform multiple iteration per test", 'l', PARAM_OPTION, 1, 0, TestValInt, {1}, NULL, pread_int, sprint_int }, + { "trace", "Enable the trace generation", -30, PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, + { "nowarmup", "Disable the warmup run to load libraries", -31, PARAM_OPTION, 0, 0, TestValInt, {0}, NULL, pread_int, sprint_int }, { NULL, "Machine parameters", 0, PARAM_OPTION, 0, 0, 0, {0}, NULL, NULL, NULL }, { "threads", "Number of CPU workers per node", 't', PARAM_OPTION | PARAM_OUTPUT, 1, 7, TestValInt, {1}, NULL, pread_int, sprint_int }, @@ -476,6 +479,7 @@ parameters_destroy() int main (int argc, char **argv) { int ncores, ngpus, human, check, i, niter; + int trace, nowarmup; int rc, info = 0; int run_id = 0; char *func_name; @@ -491,12 +495,14 @@ int main (int argc, char **argv) { parameters_read_file( input_file ); free(input_file); } - ncores = parameters_getvalue_int( "threads" ); - ngpus = parameters_getvalue_int( "gpus" ); - check = parameters_getvalue_int( "check" ); - human = parameters_getvalue_int( "human" ); - 
func_name = parameters_getvalue_str( "op" ); - niter = parameters_getvalue_int( "niter" ); + ncores = parameters_getvalue_int( "threads" ); + ngpus = parameters_getvalue_int( "gpus" ); + check = parameters_getvalue_int( "check" ); + human = parameters_getvalue_int( "human" ); + func_name = parameters_getvalue_str( "op" ); + niter = parameters_getvalue_int( "niter" ); + trace = parameters_getvalue_int( "trace" ); + nowarmup = parameters_getvalue_int( "nowarmup" ); CHAMELEON_Init( ncores, ngpus ); @@ -510,6 +516,20 @@ int main (int argc, char **argv) { /* Executes the tests */ run_print_header( test, check, human ); run = runlist->head; + + /* Warmup */ + if ( !nowarmup ) { + run_arg_list_t copy = run_arg_list_copy( &(run->args) ); + rc = test->fptr( &copy, check ); + run_arg_list_destroy( &copy ); + } + + /* Start tracing */ + if ( trace ) { + CHAMELEON_Enable( CHAMELEON_PROFILING_MODE ); + } + + /* Perform all runs */ while ( run != NULL ) { for(i=0; i<niter; i++) { run_arg_list_t copy = run_arg_list_copy( &(run->args) ); @@ -530,6 +550,12 @@ int main (int argc, char **argv) { run_list_destroy( run ); run = next; } + + /* Stop tracing */ + if ( trace ) { + CHAMELEON_Disable( CHAMELEON_PROFILING_MODE ); + } + free( runlist ); CHAMELEON_Finalize(); diff --git a/new-testing/testing_zauxiliary.h b/new-testing/testing_zauxiliary.h index 2be0d88c5..7b35505f8 100644 --- a/new-testing/testing_zauxiliary.h +++ b/new-testing/testing_zauxiliary.h @@ -23,47 +23,6 @@ #include "testings.h" -/** - * - * Macro for trace generation - * - */ -#define START_TRACING() \ - RUNTIME_start_stats(); \ - if(iparam[IPARAM_TRACE] == 2) { \ - RUNTIME_start_profiling(); \ - } \ - if(iparam[IPARAM_BOUND]) { \ - CHAMELEON_Enable(CHAMELEON_BOUND); \ - } - -#define STOP_TRACING() \ - RUNTIME_stop_stats(); \ - if(iparam[IPARAM_TRACE] == 2) { \ - RUNTIME_stop_profiling(); \ - } \ - if(iparam[IPARAM_BOUND]) { \ - CHAMELEON_Disable(CHAMELEON_BOUND); \ - } - -/** - * - * Macro for DAG generation - * - */ -#if 0
-#define START_DAG() \ - if ( iparam[IPARAM_DAG] == 2 ) \ - CHAMELEON_Enable(CHAMELEON_DAG); - -#define STOP_DAG() \ - if ( iparam[IPARAM_DAG] == 2 ) \ - CHAMELEON_Disable(CHAMELEON_DAG); -#else -#define START_DAG() do {} while(0); -#define STOP_DAG() do {} while(0); -#endif - /** * * Synchro for distributed computations @@ -82,23 +41,6 @@ * General Macros for timing * */ -/* #define START_TIMING() \ */ -/* START_DAG(); \ */ -/* START_TRACING(); \ */ -/* START_DISTRIBUTED(); \ */ -/* t = -RUNTIME_get_time(); */ - -/* #define STOP_TIMING() \ */ -/* STOP_DISTRIBUTED(); \ */ -/* t += RUNTIME_get_time(); \ */ -/* STOP_TRACING(); \ */ -/* STOP_DAG(); \ */ -/* if (iparam[IPARAM_PROFILE] == 2) { \ */ -/* RUNTIME_kernelprofile_display(); \ */ -/* RUNTIME_schedprofile_display(); \ */ -/* } \ */ -/* *t_ = t; */ - #define START_TIMING( _t_ ) \ START_DISTRIBUTED(); \ (_t_) = RUNTIME_get_time(); diff --git a/new-testing/testing_zgetrf.c b/new-testing/testing_zgetrf.c index 68491ba58..ebfea504e 100644 --- a/new-testing/testing_zgetrf.c +++ b/new-testing/testing_zgetrf.c @@ -55,8 +55,6 @@ testing_zgetrf( run_arg_list_t *args, int check ) run_arg_add_fixdbl( args, "time", t ); run_arg_add_fixdbl( args, "gflops", ( hres == CHAMELEON_SUCCESS ) ? gflops : -1. 
); - fprintf( stdout, "hres = %d\n", hres ); - /* Checks the factorisation and residue */ if ( check ) { CHAM_desc_t *descA0 = CHAMELEON_Desc_Copy( descA, NULL ); diff --git a/runtime/openmp/control/runtime_profiling.c b/runtime/openmp/control/runtime_profiling.c index dfb9e2579..b243c9d4d 100644 --- a/runtime/openmp/control/runtime_profiling.c +++ b/runtime/openmp/control/runtime_profiling.c @@ -27,12 +27,10 @@ double RUNTIME_get_time(){ void RUNTIME_start_profiling() { - chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with OpenMP\n"); } void RUNTIME_stop_profiling() { - chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with OpenMP\n"); } void RUNTIME_start_stats() diff --git a/runtime/parsec/control/runtime_profiling.c b/runtime/parsec/control/runtime_profiling.c index 870233220..c8f970005 100644 --- a/runtime/parsec/control/runtime_profiling.c +++ b/runtime/parsec/control/runtime_profiling.c @@ -26,12 +26,10 @@ double RUNTIME_get_time(){ void RUNTIME_start_profiling() { - chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with PaRSEC\n"); } void RUNTIME_stop_profiling() { - chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with PaRSEC\n"); } void RUNTIME_start_stats() diff --git a/runtime/quark/control/runtime_profiling.c b/runtime/quark/control/runtime_profiling.c index 685a81c40..10da952b0 100644 --- a/runtime/quark/control/runtime_profiling.c +++ b/runtime/quark/control/runtime_profiling.c @@ -26,12 +26,10 @@ double RUNTIME_get_time(){ void RUNTIME_start_profiling() { - chameleon_warning("RUNTIME_start_profiling()", "FxT profiling is not available with Quark\n"); } void RUNTIME_stop_profiling() { - chameleon_warning("RUNTIME_stop_profiling()", "FxT profiling is not available with Quark\n"); } void RUNTIME_start_stats() diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c index 14659f785..a0f05c045 
100644 --- a/runtime/starpu/control/runtime_control.c +++ b/runtime/starpu/control/runtime_control.c @@ -21,7 +21,7 @@ #include <stdio.h> #include <stdlib.h> #include "chameleon_starpu.h" -#if defined(HAVE_STARPU_FXT_PROFILING) +#if defined(STARPU_USE_FXT) #include <starpu_fxt.h> #endif @@ -40,7 +40,7 @@ static int chameleon_starpu_init( starpu_conf_t *conf ) MPI_Initialized( &flag ); # endif -#if defined(HAVE_STARPU_FXT_PROFILING) +#if defined(STARPU_USE_FXT) starpu_fxt_autostart_profiling(0); #endif diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c index 78d2394ef..3eb14c216 100644 --- a/runtime/starpu/control/runtime_profiling.c +++ b/runtime/starpu/control/runtime_profiling.c @@ -20,7 +20,7 @@ */ #include <math.h> #include "chameleon_starpu.h" -#if defined(HAVE_STARPU_FXT_PROFILING) +#if defined(STARPU_USE_FXT) #include <starpu_fxt.h> #endif @@ -55,18 +55,14 @@ void RUNTIME_iteration_pop( CHAM_context_t *chamctxt ) } void RUNTIME_start_profiling(){ -#if defined(HAVE_STARPU_FXT_PROFILING) +#if defined(STARPU_USE_FXT) starpu_fxt_start_profiling(); -#else - fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n"); #endif } void RUNTIME_stop_profiling(){ -#if defined(HAVE_STARPU_FXT_PROFILING) +#if defined(STARPU_USE_FXT) starpu_fxt_stop_profiling(); -#else - fprintf(stderr, "Profiling throught FxT has not been enabled in StarPU runtime (configure StarPU with --with-fxt)\n"); #endif } diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 6cf49261c..223e5d045 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -24,7 +24,6 @@ #include "chameleon/config.h" /* StarPU options */ -#cmakedefine HAVE_STARPU_FXT_PROFILING #cmakedefine HAVE_STARPU_IDLE_PREFETCH #cmakedefine HAVE_STARPU_ITERATION_PUSH #cmakedefine HAVE_STARPU_DATA_WONT_USE -- GitLab