diff --git a/control/context.c b/control/context.c
index f1d42c150871116a88f5658cd2c23a87694805fd..d724c82ad9eb61966a78e49756b79138de72b724 100644
--- a/control/context.c
+++ b/control/context.c
@@ -230,9 +230,9 @@ int CHAMELEON_Enable(int option)
             chameleon_error("CHAMELEON_Enable", "cannot enable GEMM3M (not available in cblas)");
 #endif
             break;
-        /* case CHAMELEON_PARALLEL: */
-        /*     chamctxt->parallel_enabled = CHAMELEON_TRUE; */
-        /*     break; */
+        case CHAMELEON_PARALLEL_KERNEL:
+            chamctxt->parallel_enabled = CHAMELEON_TRUE;
+            break;
         case CHAMELEON_GENERIC:
             chamctxt->generic_enabled = CHAMELEON_TRUE;
             break;
@@ -302,7 +302,7 @@ int CHAMELEON_Disable(int option)
             set_coreblas_gemm3m_enabled(0);
 #endif
             break;
-        case CHAMELEON_PARALLEL_MODE:
+        case CHAMELEON_PARALLEL_KERNEL:
             chamctxt->parallel_enabled = CHAMELEON_FALSE;
             break;
         case CHAMELEON_GENERIC:
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 573706f8d8f3eadb9941006573fe9130bdad806e..6462ddd4a5142db2b9a457d14ffe1d3f1c41bfef 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -219,7 +219,8 @@ typedef enum chameleon_gemm_e {
 #define CHAMELEON_PROFILING_MODE      CHAMELEON_GENERATE_TRACE  /* _deprecated_ */
 #define CHAMELEON_GENERATE_STATS      6
 #define CHAMELEON_KERNELPROFILE_MODE  CHAMELEON_GENERATE_STATS  /* _deprecated_ */
-#define CHAMELEON_PARALLEL_MODE       7
+#define CHAMELEON_PARALLEL_KERNEL     7
+#define CHAMELEON_PARALLEL_MODE       CHAMELEON_PARALLEL_KERNEL /* _deprecated_ */
 #define CHAMELEON_BOUND               8
 #define CHAMELEON_PROGRESS            9
 #define CHAMELEON_GEMM3M             10
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index b63ea2f7c108054181e6e3d18a4d20ae9b15cb66..2b4d0a1db651921f94fe509cd5976800e69c162a 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -79,6 +79,10 @@ if ( STARPU_FOUND )
   if ( HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS )
     message("-- ${Blue}Add definition HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS${ColourReset}")
   endif()
+  check_function_exists(starpu_parallel_worker_init HAVE_STARPU_PARALLEL_WORKER)
+  if ( HAVE_STARPU_PARALLEL_WORKER )
+    message("-- ${Blue}Add definition HAVE_STARPU_PARALLEL_WORKER${ColourReset}")
+  endif()
   check_struct_has_member( "struct starpu_data_interface_ops" reuse_data_on_node "starpu_data_interfaces.h" HAVE_STARPU_REUSE_DATA_ON_NODE LANGUAGE "C" )
   if ( HAVE_STARPU_REUSE_DATA_ON_NODE )
     message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}")
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 357a730efa27b07ea92d3a1eaede9918cd70ef9a..bc972bcf54562388b6c091049c21d972b79cadf9 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -255,6 +255,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 86afc9359a8f6bcbb219bea5021ffee5f180410d..ee2b3320fd9553ca4d258c4f9a06f4f9b91b3508 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -111,6 +111,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index fc64665bcefe751c29e21c75d53f125c1fab1173..2c7ac3e923d909452c3c524576d82fdbeff09061 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -144,6 +144,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 330076a5800bda4d03d25b4dca20a4fb0656cce0..0196649684a4652ff5686bc060aad7e7879a1f2b 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -136,6 +136,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index ba62e22f72dfb7cf7e33358c9492a64a75faab00..c83ab1283339f214c91c61f7643d75874e67658e 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -42,15 +42,15 @@ int _starpu_is_initialized(void);
  */
 void RUNTIME_context_create( CHAM_context_t *chamctxt )
 {
-    starpu_conf_t *conf;
-
     chamctxt->scheduler = RUNTIME_SCHED_STARPU;
 
-    if (! starpu_is_initialized() ) {
-        chamctxt->schedopt = (void*) malloc (sizeof(struct starpu_conf));
-        conf = chamctxt->schedopt;
+    if ( !starpu_is_initialized() ) {
+        starpu_sched_opt_t *sched_opt = malloc( sizeof(starpu_sched_opt_t) );
+
+        sched_opt->pw_config = NULL;
+        starpu_conf_init( &(sched_opt->starpu_conf) );
 
-        starpu_conf_init( conf );
+        chamctxt->schedopt = sched_opt;
     }
     else {
         chamctxt->schedopt = NULL;
@@ -65,8 +65,8 @@ void RUNTIME_context_create( CHAM_context_t *chamctxt )
 void RUNTIME_context_destroy( CHAM_context_t *chamctxt )
 {
     /* StarPU was already initialized by an external library */
-    if (chamctxt->schedopt) {
-        free(chamctxt->schedopt);
+    if ( chamctxt->schedopt ) {
+        free( chamctxt->schedopt );
     }
     return;
 }
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 00451e415ff3e1635719fbe20a448ea4d8f933c1..deeafd9e882e5955625df212645ea2db772272ab 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -31,10 +31,71 @@
 
 static int starpu_initialized = 0;
 
+#if defined(STARPU_HAVE_HWLOC) && defined(HAVE_STARPU_PARALLEL_WORKER)
+/* Creates StarPU parallel workers from CHAMELEON_PARALLEL_WORKER_LEVEL (format level[:number]); calls exit(1) on invalid input. */
+void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
+{
+    char *env_pw_level = chameleon_getenv( "CHAMELEON_PARALLEL_WORKER_LEVEL" );
+
+    if (env_pw_level != NULL) {
+        struct starpu_parallel_worker_config *pw_config = NULL;
+        hwloc_obj_type_t pw_level;
+        int  pw_level_number = 1;
+        char level[256];
+        /* Expect two fields only when a ':' is present; the %255 width keeps the name within level[] */
+        int argc  = strchr( env_pw_level, ':') == NULL ? 1 : 2;
+        int match = sscanf( env_pw_level, "%255[^:]:%d", level, &pw_level_number );
+        /* A number of 0 is rejected as well: the format stated in the message requires number > 0 */
+        if ( (match != argc) ||
+             ((match == 2) && (pw_level_number <= 0) ) )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match the format level[:number] where number > 0.\n", env_pw_level );
+            exit(1);
+        }
+
+        if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match an hwloc level.\n", level );
+            exit(1);
+        }
+
+        pw_config = starpu_parallel_worker_init( pw_level,
+                                                 STARPU_PARALLEL_WORKER_NB, pw_level_number,
+                                                 STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_GNU_OPENMP_MKL,
+                                                 0 );
+
+        if ( pw_config == NULL )
+        {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
+            exit(1);
+        }
+
+        if ( chameleon_env_on_off( "CHAMELEON_PARALLEL_WORKER_SHOW", CHAMELEON_FALSE ) == CHAMELEON_TRUE ) {
+            starpu_parallel_worker_print( pw_config );
+        }
+
+        sched_opt->pw_config = pw_config;
+    }
+
+    chameleon_cleanenv( env_pw_level );
+}
+
+/* Shuts down the parallel workers recorded in sched_opt, if any, and clears the stored handle. */
+void chameleon_starpu_parallel_worker_fini( starpu_sched_opt_t *sched_opt )
+{
+    if ( sched_opt->pw_config == NULL ) { return; }
+    starpu_parallel_worker_shutdown( sched_opt->pw_config );
+    sched_opt->pw_config = NULL;
+}
+#else
+#define chameleon_starpu_parallel_worker_init(sched_opt) do { (void)(sched_opt); } while(0) /* no-op fallbacks when hwloc or StarPU parallel workers are unavailable */
+#define chameleon_starpu_parallel_worker_fini(sched_opt) do { (void)(sched_opt); } while(0)
+#endif
+
 /**
  *
  */
-static int chameleon_starpu_init( starpu_conf_t *conf )
+static int chameleon_starpu_init( struct starpu_conf *conf )
 {
     int hres = CHAMELEON_SUCCESS;
     int rc;
@@ -83,7 +144,8 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
                   int ncudas,
                   int nthreads_per_worker )
 {
-    starpu_conf_t *conf = (starpu_conf_t*)(chamctxt->schedopt);
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    struct starpu_conf *conf = ( sched_opt != NULL ) ? &sched_opt->starpu_conf : NULL; /* schedopt is NULL when StarPU was already initialized at context creation */
     int hres = CHAMELEON_ERR_NOT_INITIALIZED;
 
     /* StarPU was already initialized by an external library */
@@ -119,8 +181,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     if ((ncpus == -1)||(nthreads_per_worker == -1))
     {
-        chamctxt->parallel_enabled = CHAMELEON_FALSE;
-
         hres = chameleon_starpu_init( conf );
 
         chamctxt->nworkers = ncpus;
@@ -129,8 +189,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     else {
         int worker;
 
-        chamctxt->parallel_enabled = CHAMELEON_TRUE;
-
         for (worker = 0; worker < ncpus; worker++)
             conf->workers_bindid[worker] = (worker+1)*nthreads_per_worker - 1;
 
@@ -152,11 +210,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     starpu_initialized = 1;
 
 #ifdef HAVE_STARPU_MALLOC_ON_NODE_SET_DEFAULT_FLAGS
-    starpu_malloc_on_node_set_default_flags(STARPU_MAIN_RAM, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
+    starpu_malloc_on_node_set_default_flags( STARPU_MAIN_RAM,
+                                             STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
 #ifdef STARPU_MALLOC_SIMULATION_FOLDED
-            | STARPU_MALLOC_SIMULATION_FOLDED
+                                             | STARPU_MALLOC_SIMULATION_FOLDED
 #endif
-            );
+                                             );
 #endif
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
@@ -165,6 +224,7 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     starpu_cham_tile_interface_init();
 
+    chameleon_starpu_parallel_worker_init( sched_opt );
     return hres;
 }
 
@@ -178,6 +238,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt )
         return;
     }
 
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    chameleon_starpu_parallel_worker_fini( sched_opt );
+
     starpu_cham_tile_interface_fini();
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 761930adebb971bfe1f9bf8d25d16cd81521e4bc..e2d964ccc98ae105c4bd79ceac832f77c8ea89e2 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -36,6 +36,8 @@
 #cmakedefine HAVE_STARPU_DATA_PEEK
 #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS
 #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE
+#cmakedefine HAVE_STARPU_PARALLEL_WORKER
+
 #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE
 #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER
 #cmakedefine HAVE_STARPU_MPI_COMM_RANK
@@ -86,7 +88,11 @@
 #include "runtime_workspace.h"
 #include "cham_tile_interface.h"
 
-typedef struct starpu_conf starpu_conf_t;
+/* Scheduler options the StarPU runtime stores in the Chameleon context (schedopt field). */
+typedef struct starpu_schedopt_s {
+    struct starpu_conf                    starpu_conf; /**< StarPU main configuration structure                         */
+    struct starpu_parallel_worker_config *pw_config;   /**< StarPU parallel workers configuration, NULL when none is set */
+} starpu_sched_opt_t;
 
 /* Structure used to give some options during one request (procedure) */
 typedef struct starpu_option_request_s {