diff --git a/doc/user/CMakeLists.txt b/doc/user/CMakeLists.txt
index 2e60e1c2be040139a0b22e6d00343290cec38707..b0ced19e0c612ad32adc89a07b8780c23bdd9632 100644
--- a/doc/user/CMakeLists.txt
+++ b/doc/user/CMakeLists.txt
@@ -58,6 +58,7 @@ set(FIGURES_USERGUIDE
     trace_qr.jpg
     potri_async.png
     chameleon_header.png
+    lstopo-sirocco24.png
     )
 foreach(_fig ${FIGURES_USERGUIDE})
     configure_file(${CMAKE_CURRENT_SOURCE_DIR}/chapters/${_fig}
diff --git a/doc/user/chapters/parallel_worker.org b/doc/user/chapters/parallel_worker.org
index 6023210712983792ffa1cdd53c2a9f7b2396504e..b18937ae9b1f01008caf0586f3dc3488cafe5563 100644
--- a/doc/user/chapters/parallel_worker.org
+++ b/doc/user/chapters/parallel_worker.org
@@ -12,7 +12,7 @@ workers which execute a single parallel task (see [[https://files.inria.fr/starp
 
 To use this functionnality:
 - StarPU must be compiled with the configure option =--enable-parallel-worker=
-- Chameleon automatically detects if the StarPU parallel workers are available or not and does not need any specific cmake options to use it.
+- Chameleon automatically detects whether the StarPU parallel workers are available and can exploit them. However, for the parallel workers to be able to perform parallel BLAS calls, Chameleon must be linked with a multi-threaded BLAS library: to do that, add =-DCHAMELEON_KERNELS_MT=ON= to your cmake command line.
 
 Below are given some examples to use the couple Chameleon/StarPU to enable parallel tasks to be run concurrently. For now, this is only available for a few subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=) but all other algorithms using these kernels benefit from it.
 
@@ -20,13 +20,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral
    :PROPERTIES:
    :CUSTOM_ID: environment-variables
    :END:
-- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]=
-  : Specify the number of parallel workers per hardware-level - the
-  default value is 1. Note that hardware-level must correspond to an
-  hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=,
-  =MACHINE=.
-- =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the parallel workers
-  contents is displayed.
+
+ - =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]=
+   Specify the number of parallel workers per hardware-level. The default value is 1. Note that hardware-level must correspond to an hwloc machine level type (hwloc_obj_type_t), e.g. =L2=, =L3=, =SOCKET=, =MACHINE=.
+ - =CHAMELEON_PARALLEL_WORKER_SHOW= : When defined, the contents of the parallel workers are displayed.
 
 **** Limitations
    :PROPERTIES:
@@ -35,11 +32,10 @@ Below are given some examples to use the couple Chameleon/StarPU to enable paral
 
 For now, there is still an issue of bad performances with the usage of the =lws= scheduler with the parallel workers.
 
-
 **** Examples
 
 In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind
-the main thread of StarPU to a dedicated CPU to a reserved CPU,
+the main thread of StarPU to a dedicated CPU,
 subtracted from the CPU workers. This avoids using a whole parallel
 worker to make the submission.
 
@@ -57,8 +53,18 @@ CPUs are dedicated to run the GPUs.
   parallel worker does not have all the CPUs of the last L3 cache, as
   there are 3 dedicated CPUs.
 
+#+begin_src sh
+CHAMELEON_PARALLEL_WORKER_LEVEL=L3 \
+CHAMELEON_PARALLEL_WORKER_SHOW=1 \
+STARPU_MAIN_THREAD_BIND=1 \
+STARPU_CALIBRATE=1 \
+STARPU_SCHED=dmdar \
+STARPU_NWORKER_PER_CUDA=2 \
+STARPU_SILENT=1 \
+$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+#+end_src
+
 #+begin_example
-$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
 Number of parallel workers created: 8
 Parallel worker 0 contains the following logical indexes:
     0 1 2 3 4 5 6 7
@@ -91,8 +97,18 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
     :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24
     :END:
 
+#+begin_src sh
+CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 \
+CHAMELEON_PARALLEL_WORKER_SHOW=1 \
+STARPU_MAIN_THREAD_BIND=1 \
+STARPU_CALIBRATE=1 \
+STARPU_SCHED=dmdar \
+STARPU_NWORKER_PER_CUDA=2 \
+STARPU_SILENT=1 \
+$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+#+end_src
+
 #+begin_example
-$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
 Number of parallel workers created: 4
 Parallel worker 0 contains the following logical indexes:
     0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
@@ -113,20 +129,20 @@ Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
    :PROPERTIES:
    :CUSTOM_ID: downloading
    :END:
-#+begin_example
+#+begin_src sh
 # Root directory
 PTCHAMELEON=~/PTCHAMELEON
 mkdir $PTCHAMELEON
 cd $PTCHAMELEON
 git clone git@gitlab.inria.fr:starpu/starpu.git
 git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
-#+end_example
+#+end_src
 
 ***** Setup on sirocco16 (2 cpu intel + 2 v100)
    :PROPERTIES:
    :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100
    :END:
-#+begin_example
+#+begin_src sh
 module load build/cmake/3.15.3  \
        linalg/mkl/2022.0.2      \
        trace/eztrace/1.1-8      \
@@ -143,28 +159,34 @@ cd $PTCHAMELEON/starpu
 ./autogen.sh
 mkdir build && cd build
 # In case you want to debug take the first line
-#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt
-../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install
+#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl \
+#             --disable-build-doc --enable-maxcpus=64 --disable-socl \
+#             --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt
+#
+../configure --enable-parallel-worker --disable-opencl --disable-build-doc \
+             --enable-maxcpus=64 --disable-socl \
+             --prefix=$PTCHAMELEON/starpu/build/install
 make -j install
 source $PTCHAMELEON/starpu/build/install/bin/starpu_env
 
 # Build Chameleon
 cd  $PTCHAMELEON/chameleon
 mkdir build && cd build
-cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON  -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON
+cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON \
+         -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON
 make -j
 
 # test
-STARPU_SILENT=1
-STARPU_SCHED=dmdar
-CHAMELEON_PARALLEL_WORKER_LEVEL=L3
-CHAMELEON_PARALLEL_WORKER_SHOW=1
-STARPU_MAIN_THREAD_BIND=1
-STARPU_CUDA_PIPELINE=2
-STARPU_NWORKER_PER_CUDA=4
-STARPU_CALIBRATE=1
-$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2
-#+end_example
+STARPU_SILENT=1                       \
+  STARPU_SCHED=dmdar                  \
+  CHAMELEON_PARALLEL_WORKER_LEVEL=L3  \
+  CHAMELEON_PARALLEL_WORKER_SHOW=1    \
+  STARPU_MAIN_THREAD_BIND=1           \
+  STARPU_CUDA_PIPELINE=2              \
+  STARPU_NWORKER_PER_CUDA=4           \
+  STARPU_CALIBRATE=1                  \
+  $PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2
+#+end_src
 
 ***** Setup on sirocco24 (2 cpu amd + 2 a100)
    :PROPERTIES:
@@ -172,7 +194,7 @@ $PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960
    :END:
 Identical to sirocco16 except for the Intel MKL library:
 
-#+begin_example
+#+begin_src sh
 module load build/cmake/3.15.3      \
             linalg/mkl/2020_update4 \
             trace/eztrace/1.1-8     \
@@ -183,11 +205,11 @@ module load build/cmake/3.15.3      \
             trace/fxt/0.3.14        \
             trace/eztrace/1.1-9     \
             language/python
-#+end_example
+#+end_src
 
 Strangely the execution requires the creation of links:
 
-#+begin_example
+#+begin_src sh
 cd $PTCHAMELEON
 for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ;
 do
@@ -195,5 +217,5 @@ do
 done
 
 LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH
-#+end_example
+#+end_src
 
diff --git a/include/chameleon/config.h.in b/include/chameleon/config.h.in
index a284fd2fd913519e07a92ee25ded2686ce2062f4..2b02d4bf4c9a08d2969bde5bd89bf0d162278112 100644
--- a/include/chameleon/config.h.in
+++ b/include/chameleon/config.h.in
@@ -37,6 +37,9 @@
 /* Debug coreblas execution order if not provided by the runtime */
 #cmakedefine CHAMELEON_KERNELS_TRACE
 
+/* Enable multi-threaded BLAS library */
+#cmakedefine CHAMELEON_KERNELS_MT
+
 /* Communication engine */
 #cmakedefine CHAMELEON_USE_MPI
 #cmakedefine CHAMELEON_USE_MIGRATE
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index deeafd9e882e5955625df212645ea2db772272ab..5a36b8a6f0a2643f148b09b90b900b9a323caa27 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -38,7 +38,6 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
 
     if (env_pw_level != NULL) {
         struct starpu_parallel_worker_config *pw_config = NULL;
-
         hwloc_obj_type_t pw_level;
         int  pw_level_number = 1;
         char level[256];
@@ -46,6 +45,10 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
         int argc  = strchr( env_pw_level, ':') == NULL ? 1 : 2;
         int match = sscanf( env_pw_level, "%[^:]:%d", level, &pw_level_number );
 
+#if !defined(CHAMELEON_KERNELS_MT)
+        chameleon_warning("chameleon_starpu_parallel_worker_init()", "CHAMELEON has been compiled with multi-threaded kernels disabled (-DCHAMELEON_KERNELS_MT=OFF). This won't break the execution, but you may not obtain the performance gain expected. It is recommended to recompile with -DCHAMELEON_KERNELS_MT=ON.\n");
+#endif
+
         if ( (match != argc) ||
              ((match == 2) && (pw_level_number < 0) ) )
         {
@@ -66,7 +69,7 @@ void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
 
         if ( pw_config == NULL )
         {
-            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );p
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
             exit(1);
         }