diff --git a/doc/user/CMakeLists.txt b/doc/user/CMakeLists.txt
index 776a06dff2d128e72d164578d55b35fa8eb6eb3a..2e60e1c2be040139a0b22e6d00343290cec38707 100644
--- a/doc/user/CMakeLists.txt
+++ b/doc/user/CMakeLists.txt
@@ -84,6 +84,7 @@ if(EMACS_COMPILER)
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/introduction.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/installing.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/using.org
+                               ${CMAKE_CURRENT_SOURCE_DIR}/chapters/parallel_worker.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/performances.org
                                ${CMAKE_CURRENT_BINARY_DIR}/CONTRIBUTING.org
                                ${CMAKE_CURRENT_BINARY_DIR}/publish.el
diff --git a/doc/user/chapters/lstopo-sirocco24.png b/doc/user/chapters/lstopo-sirocco24.png
new file mode 100644
index 0000000000000000000000000000000000000000..d1718aa80810d93326b7e39e5099c66aab7b3166
Binary files /dev/null and b/doc/user/chapters/lstopo-sirocco24.png differ
diff --git a/doc/user/chapters/parallel_worker.org b/doc/user/chapters/parallel_worker.org
new file mode 100644
index 0000000000000000000000000000000000000000..6023210712983792ffa1cdd53c2a9f7b2396504e
--- /dev/null
+++ b/doc/user/chapters/parallel_worker.org
@@ -0,0 +1,199 @@
+*** Using the =CHAMELEON_PARALLEL_WORKER= interface.
+  :PROPERTIES:
+  :CUSTOM_ID: interface-chameleon_parallel_worker
+  :END:
+
+The =CHAMELEON_PARALLEL_WORKER= interface is a extension only
+available with the StarPU runtime system that allows to run
+concurrently multi-threaded kernels.
+
+A StarPU parallel worker, previously called a cluster, is a set of
+workers which execute a single parallel task (see [[https://files.inria.fr/starpu/doc/html/ClusteringAMachine.html][StarPU Documentation]]).
+
+To use this functionnality:
+- StarPU must be compiled with the configure option =--enable-parallel-worker=
+- Chameleon automatically detects if the StarPU parallel workers are available or not and does not need any specific cmake options to use it.
+
+Below are given some examples to use the couple Chameleon/StarPU to enable parallel tasks to be run concurrently. For now, this is only available for a few subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=) but all other algorithms using these kernels benefit from it.
+
+**** Environment variables to configure the parallel workers
+   :PROPERTIES:
+   :CUSTOM_ID: environment-variables
+   :END:
+- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]=
+  : Specify the number of parallel workers per hardware-level - the
+  default value is 1. Note that hardware-level must correspond to an
+  hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=,
+  =MACHINE=.
+- =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the parallel workers
+  contents is displayed.
+
+**** Limitations
+   :PROPERTIES:
+   :CUSTOM_ID: limitations
+   :END:
+
+For now, there is still an issue of bad performances with the usage of the =lws= scheduler with the parallel workers.
+
+
+**** Examples
+
+In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind
+the main thread of StarPU to a dedicated CPU to a reserved CPU,
+subtracted from the CPU workers. This avoids using a whole parallel
+worker to make the submission.
+
+The machine has 64 CPUs. One is dedicated to the task submission, Two
+CPUs are dedicated to run the GPUs.
+
+#+caption: lstopo-sirocco24
+[[file:lstopo-sirocco24.png]]
+
+***** Example 1: Define a parallel worker per L3 cache (sirocco24)
+    :PROPERTIES:
+    :CUSTOM_ID: example-define-a-parallel-worker-per-l3-cache-sirocco24
+    :END:
+- Here we ask StarPU to create 1 parallel worker per L3 cache. The last
+  parallel worker does not have all the CPUs of the last L3 cache, as
+  there are 3 dedicated CPUs.
+
+#+begin_example
+$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+Number of parallel workers created: 8
+Parallel worker 0 contains the following logical indexes:
+    0 1 2 3 4 5 6 7
+Parallel worker 1 contains the following logical indexes:
+    8 9 10 11 12 13 14 15
+Parallel worker 2 contains the following logical indexes:
+    16 17 18 19 20 21 22 23
+Parallel worker 3 contains the following logical indexes:
+    24 25 26 27 28 29 30 31
+Parallel worker 4 contains the following logical indexes:
+    32 33 34 35 36 37 38 39
+Parallel worker 5 contains the following logical indexes:
+    40 41 42 43 44 45 46 47
+Parallel worker 6 contains the following logical indexes:
+    48 49 50 51 52 53 54 55
+Parallel worker 7 contains the following logical indexes:
+    56 57 58 59 60
+Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
+0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.282047e+00;2.141577e+04
+1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;3.404408e+00;2.064605e+04
+2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;3.427721e+00;2.050563e+04
+3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;3.707147e+00;1.896001e+04
+#+end_example
+
+***** Example 2: Define 2 parallel workers per socket (sirocco24)
+- Here we ask StarPU to create 2 parallel workers per socket. This ends
+  up with having the workers 45 and 46 in different parallel workers
+  even though they share the same L3 cache.
+    :PROPERTIES:
+    :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24
+    :END:
+
+#+begin_example
+$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+Number of parallel workers created: 4
+Parallel worker 0 contains the following logical indexes:
+    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+Parallel worker 1 contains the following logical indexes:
+    16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+Parallel worker 2 contains the following logical indexes:
+    32 33 34 35 36 37 38 39 40 41 42 43 44 45
+Parallel worker 3 contains the following logical indexes:
+    46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
+Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
+0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.256134e+00;2.158620e+04
+1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;7.003285e+00;1.003637e+04
+2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;8.816605e+00;7.972179e+03
+3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;1.064581e+01;6.602370e+03
+#+end_example
+
+**** How-to for the plafrim users
+   :PROPERTIES:
+   :CUSTOM_ID: downloading
+   :END:
+#+begin_example
+# Root directory
+PTCHAMELEON=~/PTCHAMELEON
+mkdir $PTCHAMELEON
+cd $PTCHAMELEON
+git clone git@gitlab.inria.fr:starpu/starpu.git
+git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
+#+end_example
+
+***** Setup on sirocco16 (2 cpu intel + 2 v100)
+   :PROPERTIES:
+   :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100
+   :END:
+#+begin_example
+module load build/cmake/3.15.3  \
+       linalg/mkl/2022.0.2      \
+       trace/eztrace/1.1-8      \
+       hardware/hwloc/2.7.0     \
+       compiler/gcc/11.2.0      \
+       compiler/cuda/11.6       \
+       mpi/openmpi/4.0.2        \
+       trace/fxt/0.3.14         \
+       trace/eztrace/1.1-9      \
+       language/python
+
+# Build StarPU
+cd $PTCHAMELEON/starpu
+./autogen.sh
+mkdir build && cd build
+# In case you want to debug take the first line
+#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt
+../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install
+make -j install
+source $PTCHAMELEON/starpu/build/install/bin/starpu_env
+
+# Build Chameleon
+cd  $PTCHAMELEON/chameleon
+mkdir build && cd build
+cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON  -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON
+make -j
+
+# test
+STARPU_SILENT=1
+STARPU_SCHED=dmdar
+CHAMELEON_PARALLEL_WORKER_LEVEL=L3
+CHAMELEON_PARALLEL_WORKER_SHOW=1
+STARPU_MAIN_THREAD_BIND=1
+STARPU_CUDA_PIPELINE=2
+STARPU_NWORKER_PER_CUDA=4
+STARPU_CALIBRATE=1
+$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2
+#+end_example
+
+***** Setup on sirocco24 (2 cpu amd + 2 a100)
+   :PROPERTIES:
+   :CUSTOM_ID: installation-sur-sirocco24-2-cpu-amd-2-a100
+   :END:
+Identical to sirocco16 except for the Intel MKL library:
+
+#+begin_example
+module load build/cmake/3.15.3      \
+            linalg/mkl/2020_update4 \
+            trace/eztrace/1.1-8     \
+            hardware/hwloc/2.7.0    \
+            compiler/gcc/11.2.0     \
+            compiler/cuda/11.6      \
+            mpi/openmpi/4.0.2       \
+            trace/fxt/0.3.14        \
+            trace/eztrace/1.1-9     \
+            language/python
+#+end_example
+
+Strangely the execution requires the creation of links:
+
+#+begin_example
+cd $PTCHAMELEON
+for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ;
+do
+    ln -s  /cm/shared/modules/amd/rome/compiler/intel/2020_update4/mkl/lib/intel64/$lib $lib.2
+done
+
+LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH
+#+end_example
+
diff --git a/doc/user/users_guide.org.in b/doc/user/users_guide.org.in
index 59436009411cdcf6b6900c54ff9ec64c5a59e702..55bea351062a3841aa38dfc1889aad41983556bf 100644
--- a/doc/user/users_guide.org.in
+++ b/doc/user/users_guide.org.in
@@ -70,3 +70,4 @@
 :END:
    <<sec:ug:using>>
  #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org
+ #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/parallel_worker.org