diff --git a/control/context.c b/control/context.c
index f1d42c150871116a88f5658cd2c23a87694805fd..d724c82ad9eb61966a78e49756b79138de72b724 100644
--- a/control/context.c
+++ b/control/context.c
@@ -230,9 +230,9 @@ int CHAMELEON_Enable(int option)
             chameleon_error("CHAMELEON_Enable", "cannot enable GEMM3M (not available in cblas)");
 #endif
             break;
-        /* case CHAMELEON_PARALLEL: */
-        /* chamctxt->parallel_enabled = CHAMELEON_TRUE; */
-        /* break; */
+        case CHAMELEON_PARALLEL_KERNEL:
+            chamctxt->parallel_enabled = CHAMELEON_TRUE;
+            break;
         case CHAMELEON_GENERIC:
             chamctxt->generic_enabled = CHAMELEON_TRUE;
             break;
@@ -302,7 +302,7 @@ int CHAMELEON_Disable(int option)
             set_coreblas_gemm3m_enabled(0);
 #endif
             break;
-        case CHAMELEON_PARALLEL_MODE:
+        case CHAMELEON_PARALLEL_KERNEL:
             chamctxt->parallel_enabled = CHAMELEON_FALSE;
             break;
         case CHAMELEON_GENERIC:
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 573706f8d8f3eadb9941006573fe9130bdad806e..6462ddd4a5142db2b9a457d14ffe1d3f1c41bfef 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -219,7 +219,7 @@ typedef enum chameleon_gemm_e {
 #define CHAMELEON_PROFILING_MODE CHAMELEON_GENERATE_TRACE /* _deprecated_ */
 #define CHAMELEON_GENERATE_STATS 6
 #define CHAMELEON_KERNELPROFILE_MODE CHAMELEON_GENERATE_STATS /* _deprecated_ */
-#define CHAMELEON_PARALLEL_MODE 7
+#define CHAMELEON_PARALLEL_KERNEL 7
 #define CHAMELEON_BOUND 8
 #define CHAMELEON_PROGRESS 9
 #define CHAMELEON_GEMM3M 10
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index b63ea2f7c108054181e6e3d18a4d20ae9b15cb66..2b4d0a1db651921f94fe509cd5976800e69c162a 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -79,6 +79,10 @@ if ( STARPU_FOUND )
     if ( HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS )
       message("-- ${Blue}Add definition HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS${ColourReset}")
     endif()
+    check_function_exists(starpu_parallel_worker_init HAVE_STARPU_PARALLEL_WORKER)
+    if ( HAVE_STARPU_PARALLEL_WORKER )
+      message("-- ${Blue}Add definition HAVE_STARPU_PARALLEL_WORKER${ColourReset}")
+    endif()
     check_struct_has_member( "struct starpu_data_interface_ops" reuse_data_on_node "starpu_data_interfaces.h" HAVE_STARPU_REUSE_DATA_ON_NODE LANGUAGE "C" )
     if ( HAVE_STARPU_REUSE_DATA_ON_NODE )
       message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}")
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 357a730efa27b07ea92d3a1eaede9918cd70ef9a..bc972bcf54562388b6c091049c21d972b79cadf9 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -255,6 +255,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_PRIORITY, options->priority,
         STARPU_CALLBACK, callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME, cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 86afc9359a8f6bcbb219bea5021ffee5f180410d..ee2b3320fd9553ca4d258c4f9a06f4f9b91b3508 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -111,6 +111,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
         STARPU_PRIORITY, options->priority,
         STARPU_CALLBACK, callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME, cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index fc64665bcefe751c29e21c75d53f125c1fab1173..2c7ac3e923d909452c3c524576d82fdbeff09061 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -144,6 +144,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
         STARPU_PRIORITY, options->priority,
         STARPU_CALLBACK, callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME, cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 330076a5800bda4d03d25b4dca20a4fb0656cce0..0196649684a4652ff5686bc060aad7e7879a1f2b 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -136,6 +136,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
         STARPU_PRIORITY, options->priority,
         STARPU_CALLBACK, callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME, cl_name,
 #endif
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index ba62e22f72dfb7cf7e33358c9492a64a75faab00..c83ab1283339f214c91c61f7643d75874e67658e 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -42,15 +42,15 @@ int _starpu_is_initialized(void);
  */
 void RUNTIME_context_create( CHAM_context_t *chamctxt )
 {
-    starpu_conf_t *conf;
-
     chamctxt->scheduler = RUNTIME_SCHED_STARPU;
 
-    if (! starpu_is_initialized() ) {
-        chamctxt->schedopt = (void*) malloc (sizeof(struct starpu_conf));
-        conf = chamctxt->schedopt;
+    if ( !starpu_is_initialized() ) {
+        starpu_sched_opt_t *sched_opt = malloc( sizeof(starpu_sched_opt_t) );
+
+        sched_opt->pw_config = NULL;
+        starpu_conf_init( &(sched_opt->starpu_conf) );
 
-        starpu_conf_init( conf );
+        chamctxt->schedopt = sched_opt;
     }
     else {
         chamctxt->schedopt = NULL;
@@ -65,8 +65,8 @@ void RUNTIME_context_create( CHAM_context_t *chamctxt )
 void RUNTIME_context_destroy( CHAM_context_t *chamctxt )
 {
     /* StarPU was already initialized by an external library */
-    if (chamctxt->schedopt) {
-        free(chamctxt->schedopt);
+    if ( chamctxt->schedopt ) {
+        free( chamctxt->schedopt );
     }
     return;
 }
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 00451e415ff3e1635719fbe20a448ea4d8f933c1..deeafd9e882e5955625df212645ea2db772272ab 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -31,10 +31,71 @@
 
 static int starpu_initialized = 0;
 
+#if defined(STARPU_HAVE_HWLOC) && defined(HAVE_STARPU_PARALLEL_WORKER)
+void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
+{
+    char *env_pw_level = chameleon_getenv( "CHAMELEON_PARALLEL_WORKER_LEVEL" );
+
+    if (env_pw_level != NULL) {
+        struct starpu_parallel_worker_config *pw_config = NULL;
+
+        hwloc_obj_type_t pw_level;
+        int pw_level_number = 1;
+        char level[256];
+
+        int argc = strchr( env_pw_level, ':') == NULL ? 1 : 2;
+        int match = sscanf( env_pw_level, "%255[^:]:%d", level, &pw_level_number );
+
+        if ( (match != argc) ||
+             ((match == 2) && (pw_level_number < 0) ) )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\" does not match the format level[:number] where number > 0.\n", env_pw_level );
+            exit(1);
+        }
+
+        if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\" does not match an hwloc level.\n", level );
+            exit(1);
+        }
+
+        pw_config = starpu_parallel_worker_init( pw_level,
+                                                 STARPU_PARALLEL_WORKER_NB, pw_level_number,
+                                                 STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_GNU_OPENMP_MKL,
+                                                 0 );
+
+        if ( pw_config == NULL )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
+            exit(1);
+        }
+
+        if ( chameleon_env_on_off( "CHAMELEON_PARALLEL_WORKER_SHOW", CHAMELEON_FALSE ) == CHAMELEON_TRUE ) {
+            starpu_parallel_worker_print( pw_config );
+        }
+
+        sched_opt->pw_config = pw_config;
+    }
+
+    chameleon_cleanenv( env_pw_level );
+}
+
+void chameleon_starpu_parallel_worker_fini( starpu_sched_opt_t *sched_opt )
+{
+    if ( sched_opt->pw_config != NULL ) {
+        starpu_parallel_worker_shutdown( sched_opt->pw_config );
+        sched_opt->pw_config = NULL;
+    }
+}
+#else
+#define chameleon_starpu_parallel_worker_init(sched_opt) do { (void) sched_opt; } while(0)
+#define chameleon_starpu_parallel_worker_fini(sched_opt) do { (void) sched_opt; } while(0)
+#endif
+
 /**
  *
  */
-static int chameleon_starpu_init( starpu_conf_t *conf )
+static int chameleon_starpu_init( struct starpu_conf *conf )
 {
     int hres = CHAMELEON_SUCCESS;
     int rc;
@@ -83,7 +144,8 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
                   int ncudas,
                   int nthreads_per_worker )
 {
-    starpu_conf_t *conf = (starpu_conf_t*)(chamctxt->schedopt);
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    struct starpu_conf *conf = &sched_opt->starpu_conf;
     int hres = CHAMELEON_ERR_NOT_INITIALIZED;
 
     /* StarPU was already initialized by an external library */
@@ -119,8 +181,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     if ((ncpus == -1)||(nthreads_per_worker == -1))
     {
-        chamctxt->parallel_enabled = CHAMELEON_FALSE;
-
         hres = chameleon_starpu_init( conf );
 
         chamctxt->nworkers = ncpus;
@@ -129,8 +189,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     else {
         int worker;
 
-        chamctxt->parallel_enabled = CHAMELEON_TRUE;
-
         for (worker = 0; worker < ncpus; worker++)
             conf->workers_bindid[worker] = (worker+1)*nthreads_per_worker - 1;
 
@@ -152,11 +210,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     starpu_initialized = 1;
 
 #ifdef HAVE_STARPU_MALLOC_ON_NODE_SET_DEFAULT_FLAGS
-    starpu_malloc_on_node_set_default_flags(STARPU_MAIN_RAM, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
+    starpu_malloc_on_node_set_default_flags( STARPU_MAIN_RAM,
+                                             STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
 #ifdef STARPU_MALLOC_SIMULATION_FOLDED
-            | STARPU_MALLOC_SIMULATION_FOLDED
+                                             | STARPU_MALLOC_SIMULATION_FOLDED
 #endif
-            );
+                                             );
 #endif
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
@@ -165,6 +224,7 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     starpu_cham_tile_interface_init();
 
+    chameleon_starpu_parallel_worker_init( sched_opt );
     return hres;
 }
 
@@ -178,6 +238,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt )
         return;
     }
 
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    chameleon_starpu_parallel_worker_fini( sched_opt );
+
     starpu_cham_tile_interface_fini();
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 761930adebb971bfe1f9bf8d25d16cd81521e4bc..e2d964ccc98ae105c4bd79ceac832f77c8ea89e2 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -36,6 +36,8 @@
 #cmakedefine HAVE_STARPU_DATA_PEEK
 #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS
 #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE
+#cmakedefine HAVE_STARPU_PARALLEL_WORKER
+
 #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE
 #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER
 #cmakedefine HAVE_STARPU_MPI_COMM_RANK
@@ -86,7 +88,11 @@
 #include "runtime_workspace.h"
 #include "cham_tile_interface.h"
 
-typedef struct starpu_conf starpu_conf_t;
+typedef struct starpu_schedopt_s
+{
+    struct starpu_conf starpu_conf; /**< StarPU main configuration structure */
+    struct starpu_parallel_worker_config *pw_config; /**< StarPU parallel workers configuration */
+} starpu_sched_opt_t;
 
 /* Structure used to give some options during one request (procedure) */
 typedef struct starpu_option_request_s {