diff --git a/control/context.c b/control/context.c
index f1d42c150871116a88f5658cd2c23a87694805fd..d724c82ad9eb61966a78e49756b79138de72b724 100644
--- a/control/context.c
+++ b/control/context.c
@@ -230,9 +230,9 @@ int CHAMELEON_Enable(int option)
             chameleon_error("CHAMELEON_Enable", "cannot enable GEMM3M (not available in cblas)");
 #endif
             break;
-        /* case CHAMELEON_PARALLEL: */
-        /*     chamctxt->parallel_enabled = CHAMELEON_TRUE; */
-        /*     break; */
+        case CHAMELEON_PARALLEL_KERNEL:
+            chamctxt->parallel_enabled = CHAMELEON_TRUE;
+            break;
         case CHAMELEON_GENERIC:
             chamctxt->generic_enabled = CHAMELEON_TRUE;
             break;
@@ -302,7 +302,7 @@ int CHAMELEON_Disable(int option)
             set_coreblas_gemm3m_enabled(0);
 #endif
             break;
-        case CHAMELEON_PARALLEL_MODE:
+        case CHAMELEON_PARALLEL_KERNEL:
             chamctxt->parallel_enabled = CHAMELEON_FALSE;
             break;
         case CHAMELEON_GENERIC:
diff --git a/doc/user/CMakeLists.txt b/doc/user/CMakeLists.txt
index 776a06dff2d128e72d164578d55b35fa8eb6eb3a..2e60e1c2be040139a0b22e6d00343290cec38707 100644
--- a/doc/user/CMakeLists.txt
+++ b/doc/user/CMakeLists.txt
@@ -84,6 +84,7 @@ if(EMACS_COMPILER)
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/introduction.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/installing.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/using.org
+                               ${CMAKE_CURRENT_SOURCE_DIR}/chapters/parallel_worker.org
                                ${CMAKE_CURRENT_SOURCE_DIR}/chapters/performances.org
                                ${CMAKE_CURRENT_BINARY_DIR}/CONTRIBUTING.org
                                ${CMAKE_CURRENT_BINARY_DIR}/publish.el
diff --git a/doc/user/chapters/lstopo-sirocco24.png b/doc/user/chapters/lstopo-sirocco24.png
new file mode 100644
index 0000000000000000000000000000000000000000..d1718aa80810d93326b7e39e5099c66aab7b3166
Binary files /dev/null and b/doc/user/chapters/lstopo-sirocco24.png differ
diff --git a/doc/user/chapters/parallel_worker.org b/doc/user/chapters/parallel_worker.org
new file mode 100644
index 0000000000000000000000000000000000000000..6023210712983792ffa1cdd53c2a9f7b2396504e
--- /dev/null
+++ b/doc/user/chapters/parallel_worker.org
@@ -0,0 +1,199 @@
+*** Using the =CHAMELEON_PARALLEL_WORKER= interface.
+  :PROPERTIES:
+  :CUSTOM_ID: interface-chameleon_parallel_worker
+  :END:
+
+The =CHAMELEON_PARALLEL_WORKER= interface is an extension, only
+available with the StarPU runtime system, that allows multi-threaded
+kernels to run concurrently.
+
+A StarPU parallel worker, previously called a cluster, is a set of
+workers which execute a single parallel task (see [[https://files.inria.fr/starpu/doc/html/ClusteringAMachine.html][StarPU Documentation]]).
+
+To use this functionality:
+- StarPU must be compiled with the configure option =--enable-parallel-worker=
+- Chameleon automatically detects if the StarPU parallel workers are available or not and does not need any specific cmake options to use it.
+
+Below are some examples showing how to use Chameleon together with StarPU to run parallel tasks concurrently. For now, this is only available for a small subset of tasks that are used in the Cholesky decomposition (=POTRF=, =TRSM=, =SYRK=, =HERK=, and =GEMM=), but all other algorithms using these kernels benefit from it.
+
+**** Environment variables to configure the parallel workers
+   :PROPERTIES:
+   :CUSTOM_ID: environment-variables
+   :END:
+- =CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-parallel-workers]= ::
+  Specify the number of parallel workers per hardware-level - the
+  default value is 1. Note that hardware-level must correspond to an
+  hwloc machine level type (hwloc_obj_type_t) e.g. =L2=, =L3=, =SOCKET=,
+  =MACHINE=.
+- =CHAMELEON_PARALLEL_WORKER_SHOW= :: When defined, the contents of the
+  parallel workers are displayed.
+
+**** Limitations
+   :PROPERTIES:
+   :CUSTOM_ID: limitations
+   :END:
+
+For now, performance is still poor when the =lws= scheduler is used with the parallel workers.
+
+
+**** Examples
+
+In the following examples, =STARPU_MAIN_THREAD_BIND= is set to 1 to bind
+the main thread of StarPU to a dedicated, reserved CPU that is
+subtracted from the CPU workers. This avoids using a whole parallel
+worker to make the submission.
+
+The machine has 64 CPUs. One is dedicated to the task submission, and
+two are dedicated to running the GPUs.
+
+#+caption: lstopo-sirocco24
+[[file:lstopo-sirocco24.png]]
+
+***** Example 1: Define a parallel worker per L3 cache (sirocco24)
+    :PROPERTIES:
+    :CUSTOM_ID: example-define-a-parallel-worker-per-l3-cache-sirocco24
+    :END:
+- Here we ask StarPU to create 1 parallel worker per L3 cache. The last
+  parallel worker does not have all the CPUs of the last L3 cache, as
+  there are 3 dedicated CPUs.
+
+#+begin_example
+$ CHAMELEON_PARALLEL_WORKER_LEVEL=L3 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+Number of parallel workers created: 8
+Parallel worker 0 contains the following logical indexes:
+    0 1 2 3 4 5 6 7
+Parallel worker 1 contains the following logical indexes:
+    8 9 10 11 12 13 14 15
+Parallel worker 2 contains the following logical indexes:
+    16 17 18 19 20 21 22 23
+Parallel worker 3 contains the following logical indexes:
+    24 25 26 27 28 29 30 31
+Parallel worker 4 contains the following logical indexes:
+    32 33 34 35 36 37 38 39
+Parallel worker 5 contains the following logical indexes:
+    40 41 42 43 44 45 46 47
+Parallel worker 6 contains the following logical indexes:
+    48 49 50 51 52 53 54 55
+Parallel worker 7 contains the following logical indexes:
+    56 57 58 59 60
+Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
+0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.282047e+00;2.141577e+04
+1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;3.404408e+00;2.064605e+04
+2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;3.427721e+00;2.050563e+04
+3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;3.707147e+00;1.896001e+04
+#+end_example
+
+***** Example 2: Define 2 parallel workers per socket (sirocco24)
+    :PROPERTIES:
+    :CUSTOM_ID: example-define-2-parallel-workers-per-socket-sirocco24
+    :END:
+- Here we ask StarPU to create 2 parallel workers per socket. This
+  results in workers 45 and 46 being in different parallel workers
+  even though they share the same L3 cache.
+
+#+begin_example
+$ CHAMELEON_PARALLEL_WORKER_LEVEL=socket:2 CHAMELEON_PARALLEL_WORKER_SHOW=1 STARPU_MAIN_THREAD_BIND=1 STARPU_CALIBRATE=1 STARPU_SCHED=dmdar STARPU_NWORKER_PER_CUDA=2 STARPU_SILENT=1 ~/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 1440:3000:480  -g 2
+Number of parallel workers created: 4
+Parallel worker 0 contains the following logical indexes:
+    0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
+Parallel worker 1 contains the following logical indexes:
+    16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31
+Parallel worker 2 contains the following logical indexes:
+    32 33 34 35 36 37 38 39 40 41 42 43 44 45
+Parallel worker 3 contains the following logical indexes:
+    46 47 48 49 50 51 52 53 54 55 56 57 58 59 60
+Id;Function;threads;gpus;P;Q;mtxfmt;nb;uplo;n;lda;seedA;tsub;time;gflops
+0;dpotrf;61;2;1;1;0;1440;121;59520;59520;846930886;0.000000e+00;3.256134e+00;2.158620e+04
+1;dpotrf;61;2;1;1;0;1920;121;59520;59520;1681692777;0.000000e+00;7.003285e+00;1.003637e+04
+2;dpotrf;61;2;1;1;0;2400;121;59520;59520;1714636915;0.000000e+00;8.816605e+00;7.972179e+03
+3;dpotrf;61;2;1;1;0;2880;121;59520;59520;1957747793;0.000000e+00;1.064581e+01;6.602370e+03
+#+end_example
+
+**** How-to for the plafrim users
+   :PROPERTIES:
+   :CUSTOM_ID: downloading
+   :END:
+#+begin_example
+# Root directory
+PTCHAMELEON=~/PTCHAMELEON
+mkdir $PTCHAMELEON
+cd $PTCHAMELEON
+git clone git@gitlab.inria.fr:starpu/starpu.git
+git clone --recursive git@gitlab.inria.fr:solverstack/chameleon.git
+#+end_example
+
+***** Setup on sirocco16 (2 cpu intel + 2 v100)
+   :PROPERTIES:
+   :CUSTOM_ID: setup-on-sirocco16-2-cpu-intel-2-v100
+   :END:
+#+begin_example
+module load build/cmake/3.15.3  \
+       linalg/mkl/2022.0.2      \
+       trace/eztrace/1.1-8      \
+       hardware/hwloc/2.7.0     \
+       compiler/gcc/11.2.0      \
+       compiler/cuda/11.6       \
+       mpi/openmpi/4.0.2        \
+       trace/fxt/0.3.14         \
+       trace/eztrace/1.1-9      \
+       language/python
+
+# Build StarPU
+cd $PTCHAMELEON/starpu
+./autogen.sh
+mkdir build && cd build
+# In case you want to debug take the first line
+#../configure --enable-debug --enable-verbose --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install --enable-fxt
+../configure --enable-parallel-worker --disable-opencl --disable-build-doc --enable-maxcpus=64 --disable-socl --prefix=$PTCHAMELEON/starpu/build/install
+make -j install
+source $PTCHAMELEON/starpu/build/install/bin/starpu_env
+
+# Build Chameleon
+cd  $PTCHAMELEON/chameleon
+mkdir build && cd build
+cmake .. -DBLA_VENDOR=Intel10_64lp -DCHAMELEON_KERNELS_MT=ON  -DCHAMELEON_ENABLE_EXAMPLE=OFF -DCHAMELEON_USE_CUDA=ON
+make -j
+
+# test
+export STARPU_SILENT=1
+export STARPU_SCHED=dmdar
+export CHAMELEON_PARALLEL_WORKER_LEVEL=L3
+export CHAMELEON_PARALLEL_WORKER_SHOW=1
+export STARPU_MAIN_THREAD_BIND=1
+export STARPU_CUDA_PIPELINE=2
+export STARPU_NWORKER_PER_CUDA=4
+export STARPU_CALIBRATE=1
+$PTCHAMELEON/chameleon/build/testing/chameleon_dtesting -o potrf -n 59520 -b 960:3000:480 -g 2
+#+end_example
+
+***** Setup on sirocco24 (2 cpu amd + 2 a100)
+   :PROPERTIES:
+   :CUSTOM_ID: installation-sur-sirocco24-2-cpu-amd-2-a100
+   :END:
+Identical to sirocco16 except for the Intel MKL library:
+
+#+begin_example
+module load build/cmake/3.15.3      \
+            linalg/mkl/2020_update4 \
+            trace/eztrace/1.1-8     \
+            hardware/hwloc/2.7.0    \
+            compiler/gcc/11.2.0     \
+            compiler/cuda/11.6      \
+            mpi/openmpi/4.0.2       \
+            trace/fxt/0.3.14        \
+            trace/eztrace/1.1-9     \
+            language/python
+#+end_example
+
+Strangely, the execution requires the creation of symbolic links:
+
+#+begin_example
+cd $PTCHAMELEON
+for lib in libmkl_gf_lp64.so libmkl_gnu_thread.so libmkl_intel_lp64.so libmkl_sequential.so ;
+do
+    ln -s  /cm/shared/modules/amd/rome/compiler/intel/2020_update4/mkl/lib/intel64/$lib $lib.2
+done
+
+LD_LIBRARY_PATH=$PTCHAMELEON:$LD_LIBRARY_PATH
+#+end_example
+
diff --git a/doc/user/users_guide.org.in b/doc/user/users_guide.org.in
index 59436009411cdcf6b6900c54ff9ec64c5a59e702..55bea351062a3841aa38dfc1889aad41983556bf 100644
--- a/doc/user/users_guide.org.in
+++ b/doc/user/users_guide.org.in
@@ -70,3 +70,4 @@
 :END:
    <<sec:ug:using>>
  #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/using.org
+ #+INCLUDE: @CMAKE_CURRENT_SOURCE_DIR@/chapters/parallel_worker.org
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 573706f8d8f3eadb9941006573fe9130bdad806e..6462ddd4a5142db2b9a457d14ffe1d3f1c41bfef 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -219,7 +219,8 @@ typedef enum chameleon_gemm_e {
 #define CHAMELEON_PROFILING_MODE      CHAMELEON_GENERATE_TRACE  /* _deprecated_ */
 #define CHAMELEON_GENERATE_STATS      6
 #define CHAMELEON_KERNELPROFILE_MODE  CHAMELEON_GENERATE_STATS  /* _deprecated_ */
-#define CHAMELEON_PARALLEL_MODE       7
+#define CHAMELEON_PARALLEL_KERNEL     7
+#define CHAMELEON_PARALLEL_MODE       CHAMELEON_PARALLEL_KERNEL /* _deprecated_ */
 #define CHAMELEON_BOUND               8
 #define CHAMELEON_PROGRESS            9
 #define CHAMELEON_GEMM3M             10
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index b63ea2f7c108054181e6e3d18a4d20ae9b15cb66..2b4d0a1db651921f94fe509cd5976800e69c162a 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -79,6 +79,10 @@ if ( STARPU_FOUND )
   if ( HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS )
     message("-- ${Blue}Add definition HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS${ColourReset}")
   endif()
+  check_function_exists(starpu_parallel_worker_init HAVE_STARPU_PARALLEL_WORKER)
+  if ( HAVE_STARPU_PARALLEL_WORKER )
+    message("-- ${Blue}Add definition HAVE_STARPU_PARALLEL_WORKER${ColourReset}")
+  endif()
   check_struct_has_member( "struct starpu_data_interface_ops" reuse_data_on_node "starpu_data_interfaces.h" HAVE_STARPU_REUSE_DATA_ON_NODE LANGUAGE "C" )
   if ( HAVE_STARPU_REUSE_DATA_ON_NODE )
     message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}")
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 357a730efa27b07ea92d3a1eaede9918cd70ef9a..bc972bcf54562388b6c091049c21d972b79cadf9 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -255,6 +255,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 86afc9359a8f6bcbb219bea5021ffee5f180410d..ee2b3320fd9553ca4d258c4f9a06f4f9b91b3508 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -111,6 +111,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index fc64665bcefe751c29e21c75d53f125c1fab1173..2c7ac3e923d909452c3c524576d82fdbeff09061 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -144,6 +144,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 330076a5800bda4d03d25b4dca20a4fb0656cce0..0196649684a4652ff5686bc060aad7e7879a1f2b 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -136,6 +136,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index ba62e22f72dfb7cf7e33358c9492a64a75faab00..c83ab1283339f214c91c61f7643d75874e67658e 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -42,15 +42,15 @@ int _starpu_is_initialized(void);
  */
 void RUNTIME_context_create( CHAM_context_t *chamctxt )
 {
-    starpu_conf_t *conf;
-
     chamctxt->scheduler = RUNTIME_SCHED_STARPU;
 
-    if (! starpu_is_initialized() ) {
-        chamctxt->schedopt = (void*) malloc (sizeof(struct starpu_conf));
-        conf = chamctxt->schedopt;
+    if ( !starpu_is_initialized() ) {
+        starpu_sched_opt_t *sched_opt = malloc( sizeof(starpu_sched_opt_t) );
+
+        sched_opt->pw_config = NULL;
+        starpu_conf_init( &(sched_opt->starpu_conf) );
 
-        starpu_conf_init( conf );
+        chamctxt->schedopt = sched_opt;
     }
     else {
         chamctxt->schedopt = NULL;
@@ -65,8 +65,8 @@ void RUNTIME_context_create( CHAM_context_t *chamctxt )
 void RUNTIME_context_destroy( CHAM_context_t *chamctxt )
 {
     /* StarPU was already initialized by an external library */
-    if (chamctxt->schedopt) {
-        free(chamctxt->schedopt);
+    if ( chamctxt->schedopt ) { /* non-NULL only when RUNTIME_context_create() allocated the options itself */
+        free( chamctxt->schedopt );
     }
     return;
 }
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 00451e415ff3e1635719fbe20a448ea4d8f933c1..deeafd9e882e5955625df212645ea2db772272ab 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -31,10 +31,71 @@
 
 static int starpu_initialized = 0;
 
+#if defined(STARPU_HAVE_HWLOC) && defined(HAVE_STARPU_PARALLEL_WORKER)
+/* Create the parallel workers requested by CHAMELEON_PARALLEL_WORKER_LEVEL
+ * (level[:number], number > 0) and store the result in sched_opt->pw_config.
+ * No-op when chameleon_getenv() returns NULL; errors are printed to stderr
+ * and terminate the process with exit(1). */
+void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
+{
+    char *env_pw_level = chameleon_getenv( "CHAMELEON_PARALLEL_WORKER_LEVEL" );
+
+    if (env_pw_level != NULL) {
+        struct starpu_parallel_worker_config *pw_config = NULL;
+        hwloc_obj_type_t pw_level;
+        int  pw_level_number = 1;
+        char level[256];
+        /* Two fields are expected only when a ':' is present; %255 keeps the write inside level[] */
+        int argc  = strchr( env_pw_level, ':') == NULL ? 1 : 2;
+        int match = sscanf( env_pw_level, "%255[^:]:%d", level, &pw_level_number );
+
+        if ( (match != argc) ||
+             ((match == 2) && (pw_level_number <= 0) ) ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match the format level[:number] where number > 0.\n", env_pw_level );
+            exit(1);
+        }
+        /* The level name must be recognized by hwloc */
+        if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match an hwloc level.\n", level );
+            exit(1);
+        }
+
+        pw_config = starpu_parallel_worker_init( pw_level,
+                                                 STARPU_PARALLEL_WORKER_NB, pw_level_number,
+                                                 STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_GNU_OPENMP_MKL,
+                                                 0 );
+
+        if ( pw_config == NULL ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
+            exit(1);
+        }
+
+        if ( chameleon_env_on_off( "CHAMELEON_PARALLEL_WORKER_SHOW", CHAMELEON_FALSE ) == CHAMELEON_TRUE ) {
+            starpu_parallel_worker_print( pw_config );
+        }
+
+        sched_opt->pw_config = pw_config;
+    }
+
+    chameleon_cleanenv( env_pw_level );
+}
+
+/* Shut down the parallel workers recorded in sched_opt (if any) and reset the handle */
+void chameleon_starpu_parallel_worker_fini( starpu_sched_opt_t *sched_opt )
+{
+    if ( sched_opt->pw_config == NULL ) { return; }
+    starpu_parallel_worker_shutdown( sched_opt->pw_config );
+    sched_opt->pw_config = NULL;
+}
+#else
+#define chameleon_starpu_parallel_worker_init(sched_opt) do { (void)(sched_opt); } while(0) /* no-op fallback */
+#define chameleon_starpu_parallel_worker_fini(sched_opt) do { (void)(sched_opt); } while(0) /* no-op fallback */
+#endif
+
 /**
  *
  */
-static int chameleon_starpu_init( starpu_conf_t *conf )
+static int chameleon_starpu_init( struct starpu_conf *conf )
 {
     int hres = CHAMELEON_SUCCESS;
     int rc;
@@ -83,7 +144,8 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
                   int ncudas,
                   int nthreads_per_worker )
 {
-    starpu_conf_t *conf = (starpu_conf_t*)(chamctxt->schedopt);
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    struct starpu_conf *conf = &sched_opt->starpu_conf;
     int hres = CHAMELEON_ERR_NOT_INITIALIZED;
 
     /* StarPU was already initialized by an external library */
@@ -119,8 +181,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     if ((ncpus == -1)||(nthreads_per_worker == -1))
     {
-        chamctxt->parallel_enabled = CHAMELEON_FALSE;
-
         hres = chameleon_starpu_init( conf );
 
         chamctxt->nworkers = ncpus;
@@ -129,8 +189,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     else {
         int worker;
 
-        chamctxt->parallel_enabled = CHAMELEON_TRUE;
-
         for (worker = 0; worker < ncpus; worker++)
             conf->workers_bindid[worker] = (worker+1)*nthreads_per_worker - 1;
 
@@ -152,11 +210,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     starpu_initialized = 1;
 
 #ifdef HAVE_STARPU_MALLOC_ON_NODE_SET_DEFAULT_FLAGS
-    starpu_malloc_on_node_set_default_flags(STARPU_MAIN_RAM, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
+    starpu_malloc_on_node_set_default_flags( STARPU_MAIN_RAM,
+                                             STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
 #ifdef STARPU_MALLOC_SIMULATION_FOLDED
-            | STARPU_MALLOC_SIMULATION_FOLDED
+                                             | STARPU_MALLOC_SIMULATION_FOLDED
 #endif
-            );
+                                             );
 #endif
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
@@ -165,6 +224,7 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     starpu_cham_tile_interface_init();
 
+    chameleon_starpu_parallel_worker_init( sched_opt );
     return hres;
 }
 
@@ -178,6 +238,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt )
         return;
     }
 
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    chameleon_starpu_parallel_worker_fini( sched_opt );
+
     starpu_cham_tile_interface_fini();
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 761930adebb971bfe1f9bf8d25d16cd81521e4bc..e2d964ccc98ae105c4bd79ceac832f77c8ea89e2 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -36,6 +36,8 @@
 #cmakedefine HAVE_STARPU_DATA_PEEK
 #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS
 #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE
+#cmakedefine HAVE_STARPU_PARALLEL_WORKER
+
 #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE
 #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER
 #cmakedefine HAVE_STARPU_MPI_COMM_RANK
@@ -86,7 +88,11 @@
 #include "runtime_workspace.h"
 #include "cham_tile_interface.h"
 
-typedef struct starpu_conf starpu_conf_t;
+/* StarPU-specific options stored in the Chameleon context (chamctxt->schedopt) */
+typedef struct starpu_schedopt_s {
+    struct starpu_conf                    starpu_conf; /**< Main StarPU configuration, filled by starpu_conf_init() */
+    struct starpu_parallel_worker_config *pw_config;   /**< Parallel workers configuration, NULL when none exists */
+} starpu_sched_opt_t;
 
 /* Structure used to give some options during one request (procedure) */
 typedef struct starpu_option_request_s {