Mentions légales du service

Skip to content
Snippets Groups Projects
Commit c811f25a authored by Mathieu Faverge
Browse files

Merge branch 'issue/parallel_worker' into 'master'

Fix documentation and code issues on parallel worker

See merge request !358
parents d9945af5 8ff7128f
No related branches found
No related tags found
1 merge request: !358 Fix documentation and code issues on parallel worker
...@@ -58,6 +58,7 @@ set(FIGURES_USERGUIDE ...@@ -58,6 +58,7 @@ set(FIGURES_USERGUIDE
trace_qr.jpg trace_qr.jpg
potri_async.png potri_async.png
chameleon_header.png chameleon_header.png
lstopo-sirocco24.png
) )
foreach(_fig ${FIGURES_USERGUIDE}) foreach(_fig ${FIGURES_USERGUIDE})
configure_file(${CMAKE_CURRENT_SOURCE_DIR}/chapters/${_fig} configure_file(${CMAKE_CURRENT_SOURCE_DIR}/chapters/${_fig}
......
...@@ -12,7 +12,7 @@ workers which execute a single parallel task (see [[https://files.inria.fr/starp ...@@ -12,7 +12,7 @@ workers which execute a single parallel task (see [[https://files.inria.fr/starp
To use this functionality: To use this functionality:
- StarPU must be compiled with the configure option =--enable-parallel-worker= - StarPU must be compiled with the configure option =--enable-parallel-worker=
- Chameleon automatically detects if the StarPU parallel workers are available or not and does not need any specific cmake options to use it. - Chameleon automatically detects if the StarPU parallel workers are available or not and can exploit them, but you need to force Chameleon to be linked with a multi-threaded BLAS library if you want the parallel workers to be able to make parallel BLAS calls. To do that, you must add =-DCHAMELEON_KERNELS_MT=ON= to your cmake line.
Below are given some examples to use the couple Chameleon/StarPU to enable parallel tasks to be run concurrently. For now, this is only available for a small subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=), but all other algorithms using these kernels benefit from it. Below are given some examples to use the couple Chameleon/StarPU to enable parallel tasks to be run concurrently. For now, this is only available for a small subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=), but all other algorithms using these kernels benefit from it.
...@@ -20,13 +20,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral ...@@ -20,13 +20,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral
:PROPERTIES: :PROPERTIES:
:CUSTOM_ID: environment-variables :CUSTOM_ID: environment-variables
:END: :END:
- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]=
: Specify the number of parallel workers per hardware-level - the - =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]=
default value is 1. Note that hardware-level must correspond to an Specify the number of parallel workers per hardware-level. The default value is 1. Note that hardware-level must correspond to an hwloc machine level type (hwloc_obj_type_t) e.g.: =L2=, =L3=, =SOCKET=, =MACHINE=.
hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=, - =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the contents of the parallel workers are displayed.
=MACHINE=.
- =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the contents of the
parallel workers are displayed.
**** Limitations **** Limitations
:PROPERTIES: :PROPERTIES:
...@@ -35,11 +32,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral ...@@ -35,11 +32,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral
For now, there is still a performance issue when the =lws= scheduler is used with the parallel workers. For now, there is still a performance issue when the =lws= scheduler is used with the parallel workers.
**** Examples **** Examples
In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind
the main thread of StarPU to a dedicated CPU to a reserved CPU, the main thread of StarPU to a dedicated CPU,
subtracted from the CPU workers. This avoids using a whole parallel subtracted from the CPU workers. This avoids using a whole parallel
worker to make the submission. worker to make the submission.
...@@ -57,8 +53,18 @@ CPUs are dedicated to run the GPUs. ...@@ -57,8 +53,18 @@ CPUs are dedicated to run the GPUs.
parallel worker does not have all the CPUs of the last L3 cache, as parallel worker does not have all the CPUs of the last L3 cache, as
there are 3 dedicated CPUs. there are 3 dedicated CPUs.
#+begin_src sh
CHAMELEON_PARALLEL_WORKER_LEVEL=L3 \
CHAMELEON_PARALLEL_WORKER_SHOW=1 \
STARPU_MAIN_THREAD_BIND=1 \
STARPU_CALIBRATE=1 \
STARPU_SCHED=dmdar \
STARPU_NWORKER_PER_CUDA=2 \
STARPU_SILENT=1 \
$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2
#+end_src
#+begin_example #+begin_example
$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2
Number of parallel workers created: 8 Number of parallel workers created: 8
Parallel worker 0 contains the following logical indexes: Parallel worker 0 contains the following logical indexes:
0 1 2 3 4 5 6 7 0 1 2 3 4 5 6 7
...@@ -91,8 +97,18 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops ...@@ -91,8 +97,18 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
:CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24 :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24
:END: :END:
#+begin_src sh
CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 \
CHAMELEON_PARALLEL_WORKER_SHOW=1 \
STARPU_MAIN_THREAD_BIND=1 \
STARPU_CALIBRATE=1 \
STARPU_SCHED=dmdar \
STARPU_NWORKER_PER_CUDA=2 \
STARPU_SILENT=1 \
$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2
#+end_src
#+begin_example #+begin_example
$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2
Number of parallel workers created: 4 Number of parallel workers created: 4
Parallel worker 0 contains the following logical indexes: Parallel worker 0 contains the following logical indexes:
0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
...@@ -113,20 +129,20 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops ...@@ -113,20 +129,20 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
:PROPERTIES: :PROPERTIES:
:CUSTOM_ID: downloading :CUSTOM_ID: downloading
:END: :END:
#+begin_example #+begin_src sh
# Root directory # Root directory
PTCHAMELEON=~/PTCHAMELEON PTCHAMELEON=~/PTCHAMELEON
mkdir $PTCHAMELEON mkdir $PTCHAMELEON
cd $PTCHAMELEON cd $PTCHAMELEON
git clone git@gitlab.inria.fr:starpu/starpu.git git clone git@gitlab.inria.fr:starpu/starpu.git
git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
#+end_example #+end_src
***** Setup on sirocco16 (2 cpu intel + 2 v100) ***** Setup on sirocco16 (2 cpu intel + 2 v100)
:PROPERTIES: :PROPERTIES:
:CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100 :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100
:END: :END:
#+begin_example #+begin_src sh
module load build/cmake/3.15.3 \ module load build/cmake/3.15.3 \
linalg/mkl/2022.0.2 \ linalg/mkl/2022.0.2 \
trace/eztrace/1.1-8 \ trace/eztrace/1.1-8 \
...@@ -143,28 +159,34 @@ cd $PTCHAMELEON/starpu ...@@ -143,28 +159,34 @@ cd $PTCHAMELEON/starpu
./autogen.sh ./autogen.sh
mkdir build && cd build mkdir build && cd build
# In case you want to debug take the first line # In case you want to debug take the first line
#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt #../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl \
../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install # --disable-build-doc --enable-maxcpus=64 --disable-socl \
# --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt
#
../configure --enable-parallel-worker --disable-opencl --disable-build-doc \
--enable-maxcpus=64 --disable-socl \
--prefix=$PTCHAMELEON/starpu/build/install
make -j install make -j install
source $PTCHAMELEON/starpu/build/install/bin/starpu_env source $PTCHAMELEON/starpu/build/install/bin/starpu_env
# Build Chameleon # Build Chameleon
cd $PTCHAMELEON/chameleon cd $PTCHAMELEON/chameleon
mkdir build && cd build mkdir build && cd build
cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON \
-DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON
make -j make -j
# test # test
STARPU_SILENT=1 STARPU_SILENT=1 \
STARPU_SCHED=dmdar STARPU_SCHED=dmdar \
CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_LEVEL=L3 \
CHAMELEON_PARALLEL_WORKER_SHOW=1 CHAMELEON_PARALLEL_WORKER_SHOW=1 \
STARPU_MAIN_THREAD_BIND=1 STARPU_MAIN_THREAD_BIND=1 \
STARPU_CUDA_PIPELINE=2 STARPU_CUDA_PIPELINE=2 \
STARPU_NWORKER_PER_CUDA=4 STARPU_NWORKER_PER_CUDA=4 \
STARPU_CALIBRATE=1 STARPU_CALIBRATE=1 \
$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2 $PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2
#+end_example #+end_src
***** Setup on sirocco24 (2 cpu amd + 2 a100) ***** Setup on sirocco24 (2 cpu amd + 2 a100)
:PROPERTIES: :PROPERTIES:
...@@ -172,7 +194,7 @@ $PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960 ...@@ -172,7 +194,7 @@ $PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960
:END: :END:
Identical to sirocco16 except for the Intel MKL library: Identical to sirocco16 except for the Intel MKL library:
#+begin_example #+begin_src sh
module load build/cmake/3.15.3 \ module load build/cmake/3.15.3 \
linalg/mkl/2020_update4 \ linalg/mkl/2020_update4 \
trace/eztrace/1.1-8 \ trace/eztrace/1.1-8 \
...@@ -183,11 +205,11 @@ module load build/cmake/3.15.3 \ ...@@ -183,11 +205,11 @@ module load build/cmake/3.15.3 \
trace/fxt/0.3.14 \ trace/fxt/0.3.14 \
trace/eztrace/1.1-9 \ trace/eztrace/1.1-9 \
language/python language/python
#+end_example #+end_src
Strangely, the execution requires the creation of links: Strangely, the execution requires the creation of links:
#+begin_example #+begin_src sh
cd $PTCHAMELEON cd $PTCHAMELEON
for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ; for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ;
do do
...@@ -195,5 +217,5 @@ do ...@@ -195,5 +217,5 @@ do
done done
LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH
#+end_example #+end_src
...@@ -37,6 +37,9 @@ ...@@ -37,6 +37,9 @@
/* Debug coreblas execution order if not provided by the runtime */ /* Debug coreblas execution order if not provided by the runtime */
#cmakedefine CHAMELEON_KERNELS_TRACE #cmakedefine CHAMELEON_KERNELS_TRACE
/* Enable multi-threaded BLAS library */
#cmakedefine CHAMELEON_KERNELS_MT
/* Communication engine */ /* Communication engine */
#cmakedefine CHAMELEON_USE_MPI #cmakedefine CHAMELEON_USE_MPI
#cmakedefine CHAMELEON_USE_MIGRATE #cmakedefine CHAMELEON_USE_MIGRATE
......
...@@ -38,7 +38,6 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt ) ...@@ -38,7 +38,6 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
if (env_pw_level != NULL) { if (env_pw_level != NULL) {
struct starpu_parallel_worker_config *pw_config = NULL; struct starpu_parallel_worker_config *pw_config = NULL;
hwloc_obj_type_t pw_level; hwloc_obj_type_t pw_level;
int pw_level_number = 1; int pw_level_number = 1;
char level[256]; char level[256];
...@@ -46,6 +45,10 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt ) ...@@ -46,6 +45,10 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
int argc = strchr( env_pw_level, ':') == NULL ? 1 : 2; int argc = strchr( env_pw_level, ':') == NULL ? 1 : 2;
int match = sscanf( env_pw_level, "%[^:]:%d", level, &pw_level_number ); int match = sscanf( env_pw_level, "%[^:]:%d", level, &pw_level_number );
#if !defined(CHAMELEON_KERNELS_MT)
chameleon_warning("chameleon_starpu_parallel_worker_init()", "CHAMELEON has been compiled with multi-threaded kernels disabled (-DCHAMELEON_KERNELS_MT=OFF). This won't break the execution, but you may not obtain the performance gain expected. It is recommended to recompile with -DCHAMELEON_KERNELS_MT=ON.\n");
#endif
if ( (match != argc) || if ( (match != argc) ||
((match == 2) && (pw_level_number < 0) ) ) ((match == 2) && (pw_level_number < 0) ) )
{ {
...@@ -66,7 +69,7 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt ) ...@@ -66,7 +69,7 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
if ( pw_config == NULL ) if ( pw_config == NULL )
{ {
fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );p fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
exit(1); exit(1);
} }
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment