diff --git a/control/context.c b/control/context.c index f1d42c150871116a88f5658cd2c23a87694805fd..d724c82ad9eb61966a78e49756b79138de72b724 100644 --- a/control/context.c +++ b/control/context.c @@ -230,9 +230,9 @@ int CHAMELEON_Enable(int option) chameleon_error("CHAMELEON_Enable", "cannot enable GEMM3M (not available in cblas)"); #endif break; - /* case CHAMELEON_PARALLEL: */ - /* chamctxt->parallel_enabled = CHAMELEON_TRUE; */ - /* break; */ + case CHAMELEON_PARALLEL_KERNEL: + chamctxt->parallel_enabled = CHAMELEON_TRUE; + break; case CHAMELEON_GENERIC: chamctxt->generic_enabled = CHAMELEON_TRUE; break; @@ -302,7 +302,7 @@ int CHAMELEON_Disable(int option) set_coreblas_gemm3m_enabled(0); #endif break; - case CHAMELEON_PARALLEL_MODE: + case CHAMELEON_PARALLEL_KERNEL: chamctxt->parallel_enabled = CHAMELEON_FALSE; break; case CHAMELEON_GENERIC: diff --git a/doc/user/CMakeLists.txt b/doc/user/CMakeLists.txt index 776a06dff2d128e72d164578d55b35fa8eb6eb3a..2e60e1c2be040139a0b22e6d00343290cec38707 100644 --- a/doc/user/CMakeLists.txt +++ b/doc/user/CMakeLists.txt @@ -84,6 +84,7 @@ if(EMACS_COMPILER) ${CMAKE_CURRENT_SOURCE_DIR}/chapters/introduction.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/installing.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/using.org + ${CMAKE_CURRENT_SOURCE_DIR}/chapters/parallel_worker.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/performances.org ${CMAKE_CURRENT_BINARY_DIR}/CONTRIBUTING.org ${CMAKE_CURRENT_BINARY_DIR}/publish.el diff --git a/doc/user/chapters/lstopo-sirocco24.png b/doc/user/chapters/lstopo-sirocco24.png new file mode 100644 index 0000000000000000000000000000000000000000..d1718aa80810d93326b7e39e5099c66aab7b3166 Binary files /dev/null and b/doc/user/chapters/lstopo-sirocco24.png differ diff --git a/doc/user/chapters/parallel_worker.org b/doc/user/chapters/parallel_worker.org new file mode 100644 index 0000000000000000000000000000000000000000..6023210712983792ffa1cdd53c2a9f7b2396504e --- /dev/null +++ b/doc/user/chapters/parallel_worker.org 
@@ -0,0 +1,199 @@ +*** Using the =CHAMELEON_PARALLEL_WORKER= interface. + :PROPERTIES: + :CUSTOM_ID: interface-chameleon_parallel_worker + :END: + +The =CHAMELEON_PARALLEL_WORKER= interface is an extension only +available with the StarPU runtime system that allows running +multi-threaded kernels concurrently. + +A StarPU parallel worker, previously called a cluster, is a set of +workers which execute a single parallel task (see [[https://files.inria.fr/starpu/doc/html/ClusteringAMachine.html][StarPU Documentation]]). + +To use this functionality: +- StarPU must be compiled with the configure option =--enable-parallel-worker= +- Chameleon automatically detects if the StarPU parallel workers are available or not and does not need any specific cmake options to use it. + +Below are given some examples to use the couple Chameleon/StarPU to enable parallel tasks to be run concurrently. For now, this is only available for a small subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=) but all other algorithms using these kernels benefit from it. + +**** Environment variables to configure the parallel workers + :PROPERTIES: + :CUSTOM_ID: environment-variables + :END: +- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]= + : Specify the number of parallel workers per hardware-level - the + default value is 1. Note that hardware-level must correspond to an + hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=, + =MACHINE=. +- =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the parallel workers' + contents are displayed. + +**** Limitations + :PROPERTIES: + :CUSTOM_ID: limitations + :END: + +For now, there is still an issue of bad performance with the usage of the =lws= scheduler with the parallel workers. 
+ + +**** Examples + +In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind +the main thread of StarPU to a reserved CPU, which is +subtracted from the CPU workers. This avoids using a whole parallel +worker to make the submission. + +The machine has 64 CPUs. One is dedicated to the task submission. Two +CPUs are dedicated to run the GPUs. + +#+caption: lstopo-sirocco24 +[[file:lstopo-sirocco24.png]] + +***** Example 1: Define a parallel worker per L3 cache (sirocco24) + :PROPERTIES: + :CUSTOM_ID: example-define-a-parallel-worker-per-l3-cache-sirocco24 + :END: +- Here we ask StarPU to create 1 parallel worker per L3 cache. The last + parallel worker does not have all the CPUs of the last L3 cache, as + there are 3 dedicated CPUs. + +#+begin_example +$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2 +Number of parallel workers created: 8 +Parallel worker 0 contains the following logical indexes: + 0 1 2 3 4 5 6 7 +Parallel worker 1 contains the following logical indexes: + 8 9 10 11 12 13 14 15 +Parallel worker 2 contains the following logical indexes: + 16 17 18 19 20 21 22 23 +Parallel worker 3 contains the following logical indexes: + 24 25 26 27 28 29 30 31 +Parallel worker 4 contains the following logical indexes: + 32 33 34 35 36 37 38 39 +Parallel worker 5 contains the following logical indexes: + 40 41 42 43 44 45 46 47 +Parallel worker 6 contains the following logical indexes: + 48 49 50 51 52 53 54 55 +Parallel worker 7 contains the following logical indexes: + 56 57 58 59 60 +Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops +0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.282047e+00;2.141577e+04 
+1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;3.404408e+00;2.064605e+04 +2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;3.427721e+00;2.050563e+04 +3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;3.707147e+00;1.896001e+04 +#+end_example + +***** Example 2: Define 2 parallel workers per socket (sirocco24) + :PROPERTIES: + :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24 + :END: +- Here we ask StarPU to create 2 parallel workers per socket. This ends + up with having the workers 45 and 46 in different parallel workers + even though they share the same L3 cache. + +#+begin_example +$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2 +Number of parallel workers created: 4 +Parallel worker 0 contains the following logical indexes: + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +Parallel worker 1 contains the following logical indexes: + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +Parallel worker 2 contains the following logical indexes: + 32 33 34 35 36 37 38 39 40 41 42 43 44 45 +Parallel worker 3 contains the following logical indexes: + 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 +Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops +0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.256134e+00;2.158620e+04 +1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;7.003285e+00;1.003637e+04 +2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;8.816605e+00;7.972179e+03 +3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;1.064581e+01;6.602370e+03 +#+end_example + +**** How-to for the plafrim users + :PROPERTIES: + :CUSTOM_ID: downloading + :END: +#+begin_example +# Root directory +PTCHAMELEON=~/PTCHAMELEON +mkdir 
$PTCHAMELEON +cd $PTCHAMELEON +git clone git@gitlab.inria.fr:starpu/starpu.git +git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git +#+end_example + +***** Setup on sirocco16 (2 cpu intel + 2 v100) + :PROPERTIES: + :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100 + :END: +#+begin_example +module load build/cmake/3.15.3 \ + linalg/mkl/2022.0.2 \ + trace/eztrace/1.1-8 \ + hardware/hwloc/2.7.0 \ + compiler/gcc/11.2.0 \ + compiler/cuda/11.6 \ + mpi/openmpi/4.0.2 \ + trace/fxt/0.3.14 \ + trace/eztrace/1.1-9 \ + language/python + +# Build StarPU +cd $PTCHAMELEON/starpu +./autogen.sh +mkdir build && cd build +# In case you want to debug take the first line +#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt +../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install +make -j install +source $PTCHAMELEON/starpu/build/install/bin/starpu_env + +# Build Chameleon +cd $PTCHAMELEON/chameleon +mkdir build && cd build +cmake .. 
-DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON +make -j + +# test +STARPU_SILENT=1 +STARPU_SCHED=dmdar +CHAMELEON_PARALLEL_WORKER_LEVEL=L3 +CHAMELEON_PARALLEL_WORKER_SHOW=1 +STARPU_MAIN_THREAD_BIND=1 +STARPU_CUDA_PIPELINE=2 +STARPU_NWORKER_PER_CUDA=4 +STARPU_CALIBRATE=1 +$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2 +#+end_example + +***** Setup on sirocco24 (2 cpu amd + 2 a100) + :PROPERTIES: + :CUSTOM_ID: installation-sur-sirocco24-2-cpu-amd-2-a100 + :END: +Identical to sirocco16 except for the Intel MKL library: + +#+begin_example +module load build/cmake/3.15.3 \ + linalg/mkl/2020_update4 \ + trace/eztrace/1.1-8 \ + hardware/hwloc/2.7.0 \ + compiler/gcc/11.2.0 \ + compiler/cuda/11.6 \ + mpi/openmpi/4.0.2 \ + trace/fxt/0.3.14 \ + trace/eztrace/1.1-9 \ + language/python +#+end_example + +Strangely the execution requires the creation of links: + +#+begin_example +cd $PTCHAMELEON +for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ; +do + ln -s /cm/shared/modules/amd/rome/compiler/intel/2020_update4/mkl/lib/intel64/$lib $lib.2 +done + +LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH +#+end_example + diff --git a/doc/user/users_guide.org.in b/doc/user/users_guide.org.in index 59436009411cdcf6b6900c54ff9ec64c5a59e702..55bea351062a3841aa38dfc1889aad41983556bf 100644 --- a/doc/user/users_guide.org.in +++ b/doc/user/users_guide.org.in @@ -70,3 +70,4 @@ :END: <<sec:ug:using>> #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org + #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/parallel_worker.org diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index 573706f8d8f3eadb9941006573fe9130bdad806e..6462ddd4a5142db2b9a457d14ffe1d3f1c41bfef 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -219,7 +219,7 @@ typedef enum chameleon_gemm_e { #define CHAMELEON_PROFILING_MODE 
CHAMELEON_GENERATE_TRACE /* _deprecated_ */ #define CHAMELEON_GENERATE_STATS 6 #define CHAMELEON_KERNELPROFILE_MODE CHAMELEON_GENERATE_STATS /* _deprecated_ */ -#define CHAMELEON_PARALLEL_MODE 7 +#define CHAMELEON_PARALLEL_KERNEL 7 #define CHAMELEON_BOUND 8 #define CHAMELEON_PROGRESS 9 #define CHAMELEON_GEMM3M 10 diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index b63ea2f7c108054181e6e3d18a4d20ae9b15cb66..2b4d0a1db651921f94fe509cd5976800e69c162a 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -79,6 +79,10 @@ if ( STARPU_FOUND ) if ( HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS ) message("-- ${Blue}Add definition HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS${ColourReset}") endif() + check_function_exists(starpu_parallel_worker_init HAVE_STARPU_PARALLEL_WORKER) + if ( HAVE_STARPU_PARALLEL_WORKER ) + message("-- ${Blue}Add definition HAVE_STARPU_PARALLEL_WORKER${ColourReset}") + endif() check_struct_has_member( "struct starpu_data_interface_ops" reuse_data_on_node "starpu_data_interfaces.h" HAVE_STARPU_REUSE_DATA_ON_NODE LANGUAGE "C" ) if ( HAVE_STARPU_REUSE_DATA_ON_NODE ) message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}") diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 357a730efa27b07ea92d3a1eaede9918cd70ef9a..bc972bcf54562388b6c091049c21d972b79cadf9 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -255,6 +255,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 86afc9359a8f6bcbb219bea5021ffee5f180410d..ee2b3320fd9553ca4d258c4f9a06f4f9b91b3508 100644 --- 
a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -111,6 +111,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index fc64665bcefe751c29e21c75d53f125c1fab1173..2c7ac3e923d909452c3c524576d82fdbeff09061 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -144,6 +144,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 330076a5800bda4d03d25b4dca20a4fb0656cce0..0196649684a4652ff5686bc060aad7e7879a1f2b 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -136,6 +136,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, + STARPU_POSSIBLY_PARALLEL, options->parallel, #if defined(CHAMELEON_CODELETS_HAVE_NAME) STARPU_NAME, cl_name, #endif diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c index ba62e22f72dfb7cf7e33358c9492a64a75faab00..c83ab1283339f214c91c61f7643d75874e67658e 100644 --- a/runtime/starpu/control/runtime_context.c +++ b/runtime/starpu/control/runtime_context.c @@ -42,15 +42,15 @@ int _starpu_is_initialized(void); */ void RUNTIME_context_create( CHAM_context_t *chamctxt ) { - starpu_conf_t *conf; - 
chamctxt->scheduler = RUNTIME_SCHED_STARPU; - if (! starpu_is_initialized() ) { - chamctxt->schedopt = (void*) malloc (sizeof(struct starpu_conf)); - conf = chamctxt->schedopt; + if ( !starpu_is_initialized() ) { + starpu_sched_opt_t *sched_opt = malloc( sizeof(starpu_sched_opt_t) ); + + sched_opt->pw_config = NULL; + starpu_conf_init( &(sched_opt->starpu_conf) ); - starpu_conf_init( conf ); + chamctxt->schedopt = sched_opt; } else { chamctxt->schedopt = NULL; @@ -65,8 +65,8 @@ void RUNTIME_context_create( CHAM_context_t *chamctxt ) void RUNTIME_context_destroy( CHAM_context_t *chamctxt ) { /* StarPU was already initialized by an external library */ - if (chamctxt->schedopt) { - free(chamctxt->schedopt); + if ( chamctxt->schedopt ) { + free( chamctxt->schedopt ); } return; } diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c index 00451e415ff3e1635719fbe20a448ea4d8f933c1..deeafd9e882e5955625df212645ea2db772272ab 100644 --- a/runtime/starpu/control/runtime_control.c +++ b/runtime/starpu/control/runtime_control.c @@ -31,10 +31,71 @@ static int starpu_initialized = 0; +#if defined(STARPU_HAVE_HWLOC) && defined(HAVE_STARPU_PARALLEL_WORKER) +void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt ) +{ + char *env_pw_level = chameleon_getenv( "CHAMELEON_PARALLEL_WORKER_LEVEL" ); + + if (env_pw_level != NULL) { + struct starpu_parallel_worker_config *pw_config = NULL; + + hwloc_obj_type_t pw_level; + int pw_level_number = 1; + char level[256]; + + int argc = strchr( env_pw_level, ':') == NULL ? 
1 : 2; + int match = sscanf( env_pw_level, "%[^:]:%d", level, &pw_level_number ); + + if ( (match != argc) || + ((match == 2) && (pw_level_number < 0) ) ) + { + fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\" does not match the format level[:number] where number > 0.\n", env_pw_level ); + exit(1); + } + + if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 ) + { + fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\" does not match an hwloc level.\n", level ); + exit(1); + } + + pw_config = starpu_parallel_worker_init( pw_level, + STARPU_PARALLEL_WORKER_NB, pw_level_number, + STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_GNU_OPENMP_MKL, + 0 ); + + if ( pw_config == NULL ) + { + fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level ); + exit(1); + } + + if ( chameleon_env_on_off( "CHAMELEON_PARALLEL_WORKER_SHOW", CHAMELEON_FALSE ) == CHAMELEON_TRUE ) { + starpu_parallel_worker_print( pw_config ); + } + + sched_opt->pw_config = pw_config; + } + + chameleon_cleanenv( env_pw_level ); +} + +void chameleon_starpu_parallel_worker_fini( starpu_sched_opt_t *sched_opt ) +{ + if ( sched_opt->pw_config != NULL ) { + starpu_parallel_worker_shutdown( sched_opt->pw_config ); + sched_opt->pw_config = NULL; + } +} +#else +#define chameleon_starpu_parallel_worker_init(sched_opt) do { (void) sched_opt; } while(0) +#define chameleon_starpu_parallel_worker_fini(sched_opt) do { (void) sched_opt; } while(0) +#endif + /** * */ -static int chameleon_starpu_init( starpu_conf_t *conf ) +static int chameleon_starpu_init( struct starpu_conf *conf ) { int hres = CHAMELEON_SUCCESS; int rc; @@ -83,7 +144,8 @@ int RUNTIME_init( CHAM_context_t *chamctxt, int ncudas, int nthreads_per_worker ) { - starpu_conf_t *conf = (starpu_conf_t*)(chamctxt->schedopt); + starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt); + struct starpu_conf *conf = &sched_opt->starpu_conf; int hres = 
CHAMELEON_ERR_NOT_INITIALIZED; /* StarPU was already initialized by an external library */ @@ -119,8 +181,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt, if ((ncpus == -1)||(nthreads_per_worker == -1)) { - chamctxt->parallel_enabled = CHAMELEON_FALSE; - hres = chameleon_starpu_init( conf ); chamctxt->nworkers = ncpus; @@ -129,8 +189,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt, else { int worker; - chamctxt->parallel_enabled = CHAMELEON_TRUE; - for (worker = 0; worker < ncpus; worker++) conf->workers_bindid[worker] = (worker+1)*nthreads_per_worker - 1; @@ -152,11 +210,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt, starpu_initialized = 1; #ifdef HAVE_STARPU_MALLOC_ON_NODE_SET_DEFAULT_FLAGS - starpu_malloc_on_node_set_default_flags(STARPU_MAIN_RAM, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT + starpu_malloc_on_node_set_default_flags( STARPU_MAIN_RAM, + STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT #ifdef STARPU_MALLOC_SIMULATION_FOLDED - | STARPU_MALLOC_SIMULATION_FOLDED + | STARPU_MALLOC_SIMULATION_FOLDED #endif - ); + ); #endif #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) @@ -165,6 +224,7 @@ int RUNTIME_init( CHAM_context_t *chamctxt, starpu_cham_tile_interface_init(); + chameleon_starpu_parallel_worker_init( sched_opt ); return hres; } @@ -178,6 +238,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt ) return; } + starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt); + chameleon_starpu_parallel_worker_fini( sched_opt ); + starpu_cham_tile_interface_fini(); #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 761930adebb971bfe1f9bf8d25d16cd81521e4bc..e2d964ccc98ae105c4bd79ceac832f77c8ea89e2 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -36,6 +36,8 @@ #cmakedefine HAVE_STARPU_DATA_PEEK #cmakedefine 
HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE +#cmakedefine HAVE_STARPU_PARALLEL_WORKER + #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER #cmakedefine HAVE_STARPU_MPI_COMM_RANK @@ -86,7 +88,11 @@ #include "runtime_workspace.h" #include "cham_tile_interface.h" -typedef struct starpu_conf starpu_conf_t; +typedef struct starpu_schedopt_s +{ + struct starpu_conf starpu_conf; /**< StarPU main configuration structure */ + struct starpu_parallel_worker_config *pw_config; /**< StarPU parallel workers configuration */ +} starpu_sched_opt_t; /* Structure used to give some options during one request (procedure) */ typedef struct starpu_option_request_s {