From 3b8cf6e7c0c5716275c0832b44ccab218ec359f4 Mon Sep 17 00:00:00 2001
From: Terry Cojean <terry.cojean@inria.fr>
Date: Tue, 31 Jan 2017 15:16:08 +0100
Subject: [PATCH] Integrate the use of parallel tasks for chameleon within
 the POTRF algorithm. Note that this requires the StarPU parallel worker
 interface, which is based on the hwloc vocabulary:
 CHAMELEON_PARALLEL_WORKER_LEVEL=hardware-level[:number-of-workers] and
 CHAMELEON_PARALLEL_WORKER_SHOW to print the parallel workers

---
 control/context.c                            |  8 +-
 include/chameleon/constants.h                |  2 +-
 runtime/starpu/CMakeLists.txt                |  4 +
 runtime/starpu/codelets/codelet_zgemm.c      |  1 +
 runtime/starpu/codelets/codelet_zpotrf.c     |  1 +
 runtime/starpu/codelets/codelet_zsyrk.c      |  1 +
 runtime/starpu/codelets/codelet_ztrsm.c      |  1 +
 runtime/starpu/control/runtime_context.c     | 16 ++--
 runtime/starpu/control/runtime_control.c     | 81 +++++++++++++++++---
 runtime/starpu/include/chameleon_starpu.h.in |  8 +-
 10 files changed, 100 insertions(+), 23 deletions(-)

diff --git a/control/context.c b/control/context.c
index f1d42c150..d724c82ad 100644
--- a/control/context.c
+++ b/control/context.c
@@ -230,9 +230,9 @@ int CHAMELEON_Enable(int option)
             chameleon_error("CHAMELEON_Enable", "cannot enable GEMM3M (not available in cblas)");
 #endif
             break;
-        /* case CHAMELEON_PARALLEL: */
-        /*     chamctxt->parallel_enabled = CHAMELEON_TRUE; */
-        /*     break; */
+        case CHAMELEON_PARALLEL_KERNEL:
+            chamctxt->parallel_enabled = CHAMELEON_TRUE;
+            break;
         case CHAMELEON_GENERIC:
             chamctxt->generic_enabled = CHAMELEON_TRUE;
             break;
@@ -302,7 +302,7 @@ int CHAMELEON_Disable(int option)
             set_coreblas_gemm3m_enabled(0);
 #endif
             break;
-        case CHAMELEON_PARALLEL_MODE:
+        case CHAMELEON_PARALLEL_KERNEL:
             chamctxt->parallel_enabled = CHAMELEON_FALSE;
             break;
         case CHAMELEON_GENERIC:
diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h
index 573706f8d..6462ddd4a 100644
--- a/include/chameleon/constants.h
+++ b/include/chameleon/constants.h
@@ -219,7 +219,7 @@ typedef enum chameleon_gemm_e {
 #define CHAMELEON_PROFILING_MODE      CHAMELEON_GENERATE_TRACE  /* _deprecated_ */
 #define CHAMELEON_GENERATE_STATS      6
 #define CHAMELEON_KERNELPROFILE_MODE  CHAMELEON_GENERATE_STATS  /* _deprecated_ */
-#define CHAMELEON_PARALLEL_MODE       7
+#define CHAMELEON_PARALLEL_KERNEL     7
 #define CHAMELEON_BOUND               8
 #define CHAMELEON_PROGRESS            9
 #define CHAMELEON_GEMM3M             10
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index b63ea2f7c..2b4d0a1db 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -79,6 +79,10 @@ if ( STARPU_FOUND )
   if ( HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS )
     message("-- ${Blue}Add definition HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS${ColourReset}")
   endif()
+  check_function_exists(starpu_parallel_worker_init HAVE_STARPU_PARALLEL_WORKER)
+  if ( HAVE_STARPU_PARALLEL_WORKER )
+    message("-- ${Blue}Add definition HAVE_STARPU_PARALLEL_WORKER${ColourReset}")
+  endif()
   check_struct_has_member( "struct starpu_data_interface_ops" reuse_data_on_node "starpu_data_interfaces.h" HAVE_STARPU_REUSE_DATA_ON_NODE LANGUAGE "C" )
   if ( HAVE_STARPU_REUSE_DATA_ON_NODE )
     message("-- ${Blue}Add definition HAVE_STARPU_REUSE_DATA_ON_NODE${ColourReset}")
diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c
index 357a730ef..bc972bcf5 100644
--- a/runtime/starpu/codelets/codelet_zgemm.c
+++ b/runtime/starpu/codelets/codelet_zgemm.c
@@ -255,6 +255,7 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c
index 86afc9359..ee2b3320f 100644
--- a/runtime/starpu/codelets/codelet_zpotrf.c
+++ b/runtime/starpu/codelets/codelet_zpotrf.c
@@ -111,6 +111,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c
index fc64665bc..2c7ac3e92 100644
--- a/runtime/starpu/codelets/codelet_zsyrk.c
+++ b/runtime/starpu/codelets/codelet_zsyrk.c
@@ -144,6 +144,7 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c
index 330076a58..019664968 100644
--- a/runtime/starpu/codelets/codelet_ztrsm.c
+++ b/runtime/starpu/codelets/codelet_ztrsm.c
@@ -136,6 +136,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options,
         STARPU_PRIORITY,          options->priority,
         STARPU_CALLBACK,          callback,
         STARPU_EXECUTE_ON_WORKER, options->workerid,
+        STARPU_POSSIBLY_PARALLEL, options->parallel,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
         STARPU_NAME,              cl_name,
 #endif
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index ba62e22f7..c83ab1283 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -42,15 +42,15 @@ int _starpu_is_initialized(void);
  */
 void RUNTIME_context_create( CHAM_context_t *chamctxt )
 {
-    starpu_conf_t *conf;
-
     chamctxt->scheduler = RUNTIME_SCHED_STARPU;
 
-    if (! starpu_is_initialized() ) {
-        chamctxt->schedopt = (void*) malloc (sizeof(struct starpu_conf));
-        conf = chamctxt->schedopt;
+    if ( !starpu_is_initialized() ) {
+        starpu_sched_opt_t *sched_opt = malloc( sizeof(starpu_sched_opt_t) );
+
+        sched_opt->pw_config = NULL;
+        starpu_conf_init( &(sched_opt->starpu_conf) );
 
-        starpu_conf_init( conf );
+        chamctxt->schedopt = sched_opt;
     }
     else {
         chamctxt->schedopt = NULL;
@@ -65,8 +65,8 @@ void RUNTIME_context_create( CHAM_context_t *chamctxt )
 void RUNTIME_context_destroy( CHAM_context_t *chamctxt )
 {
     /* StarPU was already initialized by an external library */
-    if (chamctxt->schedopt) {
-        free(chamctxt->schedopt);
+    if ( chamctxt->schedopt ) {
+        free( chamctxt->schedopt );
     }
     return;
 }
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index 00451e415..deeafd9e8 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -31,10 +31,71 @@
 
 static int starpu_initialized = 0;
 
+#if defined(STARPU_HAVE_HWLOC) && defined(HAVE_STARPU_PARALLEL_WORKER)
+/**
+ * Create the StarPU parallel workers requested through the environment.
+ *
+ * CHAMELEON_PARALLEL_WORKER_LEVEL=level[:number] gives the hwloc level name and
+ * an optional, strictly positive, count passed as STARPU_PARALLEL_WORKER_NB
+ * (1 when omitted). An invalid value prints an error on stderr and calls exit(1).
+ */
+void chameleon_starpu_parallel_worker_init( starpu_sched_opt_t *sched_opt )
+{
+    char *env_pw_level = chameleon_getenv( "CHAMELEON_PARALLEL_WORKER_LEVEL" );
+
+    if (env_pw_level != NULL) {
+        struct starpu_parallel_worker_config *pw_config = NULL;
+        hwloc_obj_type_t pw_level;
+        int  pw_level_number = 1;
+        char level[256];
+
+        /* A ':' makes the number mandatory; %255 keeps the level name within level[] */
+        int argc  = strchr( env_pw_level, ':') == NULL ? 1 : 2;
+        int match = sscanf( env_pw_level, "%255[^:]:%d", level, &pw_level_number );
+
+        if ( (match != argc) || ((match == 2) && (pw_level_number <= 0)) ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match the format level[:number] where number > 0.\n", env_pw_level );
+            exit(1);
+        }
+
+        if ( hwloc_type_sscanf( level, &pw_level, NULL, 0 ) == -1 ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL \"%s\"  does not match an hwloc level.\n", level );
+            exit(1);
+        }
+
+        pw_config = starpu_parallel_worker_init( pw_level,
+                                                 STARPU_PARALLEL_WORKER_NB, pw_level_number,
+                                                 STARPU_PARALLEL_WORKER_TYPE, STARPU_PARALLEL_WORKER_GNU_OPENMP_MKL,
+                                                 0 );
+        if ( pw_config == NULL ) {
+            fprintf( stderr, "error CHAMELEON_PARALLEL_WORKER_LEVEL : cannot create a parallel worker at %s level.\n", level );
+            exit(1);
+        }
+
+        if ( chameleon_env_on_off( "CHAMELEON_PARALLEL_WORKER_SHOW", CHAMELEON_FALSE ) == CHAMELEON_TRUE ) {
+            starpu_parallel_worker_print( pw_config );
+        }
+        sched_opt->pw_config = pw_config;
+    }
+    chameleon_cleanenv( env_pw_level );
+}
+
+/* Shut down the parallel workers stored in sched_opt, if any, and clear the handle. */
+void chameleon_starpu_parallel_worker_fini( starpu_sched_opt_t *sched_opt )
+{
+    if ( sched_opt->pw_config == NULL ) { return; }
+    starpu_parallel_worker_shutdown( sched_opt->pw_config );
+    sched_opt->pw_config = NULL;
+}
+#else
+#define chameleon_starpu_parallel_worker_init(sched_opt) do { (void) sched_opt; } while(0)
+#define chameleon_starpu_parallel_worker_fini(sched_opt) do { (void) sched_opt; } while(0)
+#endif
+
 /**
  *
  */
-static int chameleon_starpu_init( starpu_conf_t *conf )
+static int chameleon_starpu_init( struct starpu_conf *conf )
 {
     int hres = CHAMELEON_SUCCESS;
     int rc;
@@ -83,7 +144,8 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
                   int ncudas,
                   int nthreads_per_worker )
 {
-    starpu_conf_t *conf = (starpu_conf_t*)(chamctxt->schedopt);
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    struct starpu_conf *conf = &sched_opt->starpu_conf;
     int hres = CHAMELEON_ERR_NOT_INITIALIZED;
 
     /* StarPU was already initialized by an external library */
@@ -119,8 +181,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     if ((ncpus == -1)||(nthreads_per_worker == -1))
     {
-        chamctxt->parallel_enabled = CHAMELEON_FALSE;
-
         hres = chameleon_starpu_init( conf );
 
         chamctxt->nworkers = ncpus;
@@ -129,8 +189,6 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     else {
         int worker;
 
-        chamctxt->parallel_enabled = CHAMELEON_TRUE;
-
         for (worker = 0; worker < ncpus; worker++)
             conf->workers_bindid[worker] = (worker+1)*nthreads_per_worker - 1;
 
@@ -152,11 +210,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
     starpu_initialized = 1;
 
 #ifdef HAVE_STARPU_MALLOC_ON_NODE_SET_DEFAULT_FLAGS
-    starpu_malloc_on_node_set_default_flags(STARPU_MAIN_RAM, STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
+    starpu_malloc_on_node_set_default_flags( STARPU_MAIN_RAM,
+                                             STARPU_MALLOC_PINNED | STARPU_MALLOC_COUNT
 #ifdef STARPU_MALLOC_SIMULATION_FOLDED
-            | STARPU_MALLOC_SIMULATION_FOLDED
+                                             | STARPU_MALLOC_SIMULATION_FOLDED
 #endif
-            );
+                                             );
 #endif
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
@@ -165,6 +224,7 @@ int RUNTIME_init( CHAM_context_t *chamctxt,
 
     starpu_cham_tile_interface_init();
 
+    chameleon_starpu_parallel_worker_init( sched_opt );
     return hres;
 }
 
@@ -178,6 +238,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt )
         return;
     }
 
+    starpu_sched_opt_t *sched_opt = (starpu_sched_opt_t*)(chamctxt->schedopt);
+    chameleon_starpu_parallel_worker_fini( sched_opt );
+
     starpu_cham_tile_interface_fini();
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 761930ade..e2d964ccc 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -36,6 +36,8 @@
 #cmakedefine HAVE_STARPU_DATA_PEEK
 #cmakedefine HAVE_STARPU_SET_LIMIT_SUBMITTED_TASKS
 #cmakedefine HAVE_STARPU_REUSE_DATA_ON_NODE
+#cmakedefine HAVE_STARPU_PARALLEL_WORKER
+
 #cmakedefine HAVE_STARPU_MPI_DATA_MIGRATE
 #cmakedefine HAVE_STARPU_MPI_DATA_REGISTER
 #cmakedefine HAVE_STARPU_MPI_COMM_RANK
@@ -86,7 +88,11 @@
 #include "runtime_workspace.h"
 #include "cham_tile_interface.h"
 
-typedef struct starpu_conf starpu_conf_t;
+typedef struct starpu_schedopt_s
+{
+    struct starpu_conf                    starpu_conf; /**< StarPU main configuration structure   */
+    struct starpu_parallel_worker_config *pw_config;   /**< StarPU parallel workers configuration */
+} starpu_sched_opt_t;
 
 /* Structure used to give some options during one request (procedure) */
 typedef struct starpu_option_request_s {
-- 
GitLab