diff --git a/doc/user/CMakeLists.txt b/doc/user/CMakeLists.txt index 776a06dff2d128e72d164578d55b35fa8eb6eb3a..2e60e1c2be040139a0b22e6d00343290cec38707 100644 --- a/doc/user/CMakeLists.txt +++ b/doc/user/CMakeLists.txt @@ -84,6 +84,7 @@ if(EMACS_COMPILER) ${CMAKE_CURRENT_SOURCE_DIR}/chapters/introduction.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/installing.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/using.org + ${CMAKE_CURRENT_SOURCE_DIR}/chapters/parallel_worker.org ${CMAKE_CURRENT_SOURCE_DIR}/chapters/performances.org ${CMAKE_CURRENT_BINARY_DIR}/CONTRIBUTING.org ${CMAKE_CURRENT_BINARY_DIR}/publish.el diff --git a/doc/user/chapters/lstopo-sirocco24.png b/doc/user/chapters/lstopo-sirocco24.png new file mode 100644 index 0000000000000000000000000000000000000000..d1718aa80810d93326b7e39e5099c66aab7b3166 Binary files /dev/null and b/doc/user/chapters/lstopo-sirocco24.png differ diff --git a/doc/user/chapters/parallel_worker.org b/doc/user/chapters/parallel_worker.org new file mode 100644 index 0000000000000000000000000000000000000000..6023210712983792ffa1cdd53c2a9f7b2396504e --- /dev/null +++ b/doc/user/chapters/parallel_worker.org @@ -0,0 +1,199 @@ +*** Using the =CHAMELEON_PARALLEL_WORKER= interface. + :PROPERTIES: + :CUSTOM_ID: interface-chameleon_parallel_worker + :END: + +The =CHAMELEON_PARALLEL_WORKER= interface is an extension only +available with the StarPU runtime system that allows running +multi-threaded kernels concurrently. + +A StarPU parallel worker, previously called a cluster, is a set of +workers which execute a single parallel task (see [[https://files.inria.fr/starpu/doc/html/ClusteringAMachine.html][StarPU Documentation]]). + +To use this functionality: +- StarPU must be compiled with the configure option =--enable-parallel-worker= +- Chameleon automatically detects whether the StarPU parallel workers are available and does not need any specific CMake option to use them. 
+ +The examples below show how to use Chameleon together with StarPU to run parallel tasks concurrently. For now, this is only available for a small subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=), but all other algorithms using these kernels benefit from it. + +**** Environment variables to configure the parallel workers + :PROPERTIES: + :CUSTOM_ID: environment-variables + :END: +- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]= + : Specify the number of parallel workers per hardware-level - the + default value is 1. Note that hardware-level must correspond to an + hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=, + =MACHINE=. +- =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the contents of the + parallel workers are displayed. + +**** Limitations + :PROPERTIES: + :CUSTOM_ID: limitations + :END: + +For now, there is still a known issue of poor performance when the =lws= scheduler is used with the parallel workers. + + +**** Examples + +In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind +the main thread of StarPU to a dedicated, reserved CPU that is +subtracted from the CPU workers. This avoids using a whole parallel +worker to make the submission. + +The machine has 64 CPUs. One is dedicated to task submission, and two +CPUs are dedicated to running the GPUs. + +#+caption: lstopo-sirocco24 +[[file:lstopo-sirocco24.png]] + +***** Example 1: Define a parallel worker per L3 cache (sirocco24) + :PROPERTIES: + :CUSTOM_ID: example-define-a-parallel-worker-per-l3-cache-sirocco24 + :END: +- Here we ask StarPU to create 1 parallel worker per L3 cache. The last + parallel worker does not have all the CPUs of the last L3 cache, as + there are 3 dedicated CPUs. 
+ +#+begin_example +$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2 +Number of parallel workers created: 8 +Parallel worker 0 contains the following logical indexes: + 0 1 2 3 4 5 6 7 +Parallel worker 1 contains the following logical indexes: + 8 9 10 11 12 13 14 15 +Parallel worker 2 contains the following logical indexes: + 16 17 18 19 20 21 22 23 +Parallel worker 3 contains the following logical indexes: + 24 25 26 27 28 29 30 31 +Parallel worker 4 contains the following logical indexes: + 32 33 34 35 36 37 38 39 +Parallel worker 5 contains the following logical indexes: + 40 41 42 43 44 45 46 47 +Parallel worker 6 contains the following logical indexes: + 48 49 50 51 52 53 54 55 +Parallel worker 7 contains the following logical indexes: + 56 57 58 59 60 +Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops +0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.282047e+00;2.141577e+04 +1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;3.404408e+00;2.064605e+04 +2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;3.427721e+00;2.050563e+04 +3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;3.707147e+00;1.896001e+04 +#+end_example + +***** Example 2: Define 2 parallel workers per socket (sirocco24) 
+ :PROPERTIES: + :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24 + :END: +- Here we ask StarPU to create 2 parallel workers per socket. This + results in workers 45 and 46 being placed in different parallel workers + even though they share the same L3 cache. + +#+begin_example +$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480 -g 2 +Number of parallel workers created: 4 +Parallel worker 0 contains the following logical indexes: + 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 +Parallel worker 1 contains the following logical indexes: + 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 +Parallel worker 2 contains the following logical indexes: + 32 33 34 35 36 37 38 39 40 41 42 43 44 45 +Parallel worker 3 contains the following logical indexes: + 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 +Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops +0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.256134e+00;2.158620e+04 +1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;7.003285e+00;1.003637e+04 +2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;8.816605e+00;7.972179e+03 +3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;1.064581e+01;6.602370e+03 +#+end_example + +**** How-to for the plafrim users + :PROPERTIES: + :CUSTOM_ID: downloading + :END: +#+begin_example +# Root directory +PTCHAMELEON=~/PTCHAMELEON +mkdir $PTCHAMELEON +cd $PTCHAMELEON +git clone git@gitlab.inria.fr:starpu/starpu.git +git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git +#+end_example + +***** Setup on sirocco16 (2 cpu intel + 2 v100) + :PROPERTIES: + :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100 + :END: +#+begin_example +module load build/cmake/3.15.3 \ + linalg/mkl/2022.0.2 \ + trace/eztrace/1.1-8 \ + hardware/hwloc/2.7.0 \ + compiler/gcc/11.2.0 \ + compiler/cuda/11.6 \ + mpi/openmpi/4.0.2 \ + trace/fxt/0.3.14 \ + trace/eztrace/1.1-9 \ + language/python + +# 
Build StarPU +cd $PTCHAMELEON/starpu +./autogen.sh +mkdir build && cd build +# In case you want to debug take the first line +#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt +../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install +make -j install +source $PTCHAMELEON/starpu/build/install/bin/starpu_env + +# Build Chameleon +cd $PTCHAMELEON/chameleon +mkdir build && cd build +cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON +make -j + +# test +STARPU_SILENT=1 +STARPU_SCHED=dmdar +CHAMELEON_PARALLEL_WORKER_LEVEL=L3 +CHAMELEON_PARALLEL_WORKER_SHOW=1 +STARPU_MAIN_THREAD_BIND=1 +STARPU_CUDA_PIPELINE=2 +STARPU_NWORKER_PER_CUDA=4 +STARPU_CALIBRATE=1 +$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2 +#+end_example + +***** Setup on sirocco24 (2 cpu amd + 2 a100) + :PROPERTIES: + :CUSTOM_ID: installation-sur-sirocco24-2-cpu-amd-2-a100 + :END: +Identical to sirocco16 except for the Intel MKL library: + +#+begin_example +module load build/cmake/3.15.3 \ + linalg/mkl/2020_update4 \ + trace/eztrace/1.1-8 \ + hardware/hwloc/2.7.0 \ + compiler/gcc/11.2.0 \ + compiler/cuda/11.6 \ + mpi/openmpi/4.0.2 \ + trace/fxt/0.3.14 \ + trace/eztrace/1.1-9 \ + language/python +#+end_example + +Strangely the execution requires the creation of links: + +#+begin_example +cd $PTCHAMELEON +for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ; +do + ln -s /cm/shared/modules/amd/rome/compiler/intel/2020_update4/mkl/lib/intel64/$lib $lib.2 +done + +LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH +#+end_example + diff --git a/doc/user/users_guide.org.in b/doc/user/users_guide.org.in index 
59436009411cdcf6b6900c54ff9ec64c5a59e702..55bea351062a3841aa38dfc1889aad41983556bf 100644 --- a/doc/user/users_guide.org.in +++ b/doc/user/users_guide.org.in @@ -70,3 +70,4 @@ :END: <<sec:ug:using>> #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org + #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/parallel_worker.org