From d45d16367e2c46c4a014766146ac4b02807ee265 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 10 Jan 2018 19:10:21 +0100
Subject: [PATCH] Apply cleanup to the starpu implementation of the API

---
 control/auxiliary.c                          |  43 ++--
 control/control.c                            |  42 ++--
 control/descriptor.c                         |  25 ++-
 include/chameleon/morse_runtime.h            |  18 +-
 include/morse.h                              |   4 +-
 runtime/starpu/codelets/codelet_dataflush.c  |   4 +-
 runtime/starpu/control/runtime_async.c       |  59 ++----
 runtime/starpu/control/runtime_context.c     |   1 -
 runtime/starpu/control/runtime_control.c     | 143 +++++++------
 runtime/starpu/control/runtime_descriptor.c  | 202 ++++++++++++-------
 runtime/starpu/control/runtime_profiling.c   |  19 ++
 runtime/starpu/include/chameleon_starpu.h.in |  10 +
 12 files changed, 349 insertions(+), 221 deletions(-)

diff --git a/control/auxiliary.c b/control/auxiliary.c
index 00c6bdc59..b71dafa52 100644
--- a/control/auxiliary.c
+++ b/control/auxiliary.c
@@ -107,7 +107,7 @@ void morse_fatal_error(const char *func_name, const char *msg_text)
  **/
 int morse_rank(MORSE_context_t *morse)
 {
-    return RUNTIME_rank( morse );
+    return RUNTIME_thread_rank( morse );
 }
 
 /*******************************************************************************
@@ -233,31 +233,36 @@ int MORSE_My_Mpi_Rank(void)
     return MORSE_SUCCESS;
 #endif
 }
+
 /*******************************************************************************
  *  Display a progress percentage in stderr
  **/
 void update_progress(int currentValue, int maximumValue) {
-  div_t res ;
-  static int progress = -1; /* varie de 0 a 100 au cours du calcul concerne */
+    div_t res ;
+    static int progress = -1; /* varie de 0 a 100 au cours du calcul concerne */
 
-  if (maximumValue==0)
-    res.quot=100 ;
-  else {
-    if (currentValue<INT_MAX/100)
-      res=div(currentValue*100, maximumValue) ;
-    /* Calcule le quotient de la division */
-    else
-      res.quot=(int)( (long long) currentValue*100/maximumValue) ;
-  }
+    if (maximumValue == 0) {
+        res.quot = 100;
+    }
+    else {
+        if (currentValue < (INT_MAX / 100) ) {
+            res = div(currentValue*100, maximumValue);
+        }
+        else {
+            /* Calcule le quotient de la division */
+            res.quot = (int)( (long long)( currentValue * 100 ) / maximumValue );
+        }
+    }
 
-  // Print the percentage
-  if (res.quot > progress)
-    fprintf(stderr, "%3d%%\b\b\b\b", res.quot) ;
-  progress=res.quot ;
+    // Print the percentage
+    if (res.quot > progress) {
+        fprintf(stderr, "%3d%%\b\b\b\b", res.quot);
+    }
+    progress = res.quot;
 
-  if (currentValue>=maximumValue) {
-    progress=-1 ;
-  }
+    if (currentValue >= maximumValue) {
+        progress = -1;
+    }
 }
 
 // A function to display the progress indicator.
diff --git a/control/control.c b/control/control.c
index 4a9dff466..fb742b5b4 100644
--- a/control/control.c
+++ b/control/control.c
@@ -111,7 +111,7 @@ int MORSE_InitPar(int ncpus, int ncudas, int nthreads_per_worker)
     }
 #  endif
 #endif
-    RUNTIME_init_scheduler( morse, ncpus, ncudas, nthreads_per_worker );
+    RUNTIME_init( morse, ncpus, ncudas, nthreads_per_worker );
     return MORSE_SUCCESS;
 }
 
@@ -138,7 +138,7 @@ int MORSE_Finalize(void)
 #  if !defined(CHAMELEON_SIMULATION)
     RUNTIME_barrier(morse);
 #  endif
-    RUNTIME_finalize_scheduler( morse );
+    RUNTIME_finalize( morse );
 
 #if defined(CHAMELEON_USE_MPI)
     if (!morse->mpi_outer_init)
@@ -250,14 +250,19 @@ int MORSE_Distributed_stop(void)
  *
  ******************************************************************************
  *
- * @return
- *          \retval MORSE_SUCCESS successful exit
+ * @retval The size of the distributed computation
+ * @retval -1 if context not initialized
  *
  *****************************************************************************/
-int MORSE_Comm_size( int *size )
+int MORSE_Comm_size()
 {
-    RUNTIME_comm_size (size);
-    return MORSE_SUCCESS;
+    MORSE_context_t *morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_Comm_size()", "MORSE not initialized");
+        return -1;
+    }
+
+    return RUNTIME_comm_size( morse );
 }
 
 /** ***************************************************************************
@@ -268,14 +273,19 @@ int MORSE_Comm_size( int *size )
  *
  ******************************************************************************
  *
- * @return
- *          \retval MORSE_SUCCESS successful exit
+ * @retval The rank of the distributed computation
+ * @retval -1 if context not initialized
  *
  *****************************************************************************/
-int MORSE_Comm_rank( int *rank )
+int MORSE_Comm_rank()
 {
-    RUNTIME_comm_rank (rank);
-    return MORSE_SUCCESS;
+    MORSE_context_t *morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_Comm_rank()", "MORSE not initialized");
+        return -1;
+    }
+
+    return RUNTIME_comm_rank( morse );
 }
 
 /** ***************************************************************************
@@ -293,5 +303,11 @@ int MORSE_Comm_rank( int *rank )
  *****************************************************************************/
 int MORSE_GetThreadNbr( )
 {
-    return RUNTIME_get_thread_nbr();
+    MORSE_context_t *morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_GetThreadNbr()", "MORSE not initialized");
+        return -1;
+    }
+
+    return RUNTIME_thread_size( morse );
 }
diff --git a/control/descriptor.c b/control/descriptor.c
index 26a0d4747..bec07637c 100644
--- a/control/descriptor.c
+++ b/control/descriptor.c
@@ -110,7 +110,17 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz,
                                   int   (*get_blkldd) ( const MORSE_desc_t*, int      ),
                                   int   (*get_rankof) ( const MORSE_desc_t*, int, int ))
 {
+    MORSE_context_t *morse;
     MORSE_desc_t desc;
+
+    memset( &desc, 0, sizeof(MORSE_desc_t) );
+
+    morse = morse_context_self();
+    if (morse == NULL) {
+        morse_error("MORSE_Desc_Create", "MORSE not initialized");
+        return desc;
+    }
+
     // If one of the function get_* is NULL, we switch back to the default, like in morse_desc_init()
     desc.get_blkaddr = get_blkaddr ? get_blkaddr : morse_getaddr_ccrb;
     desc.get_blkldd  = get_blkldd  ? get_blkldd  : morse_getblkldd_ccrb;
@@ -144,7 +154,7 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz,
     desc.register_mat = 1;
     desc.ooc          = 0;
 
-    RUNTIME_comm_rank( &(desc.myrank) );
+    desc.myrank = RUNTIME_comm_rank( morse );
 
     // Grid size
     desc.p = p;
@@ -185,8 +195,6 @@ MORSE_desc_t morse_desc_init_user(MORSE_enum dtyp, int mb, int nb, int bsiz,
     desc.A12 = (size_t)(           desc.llm%mb)*(size_t)(desc.lln - desc.lln%nb) + desc.A21;
     desc.A22 = (size_t)(desc.llm - desc.llm%mb)*(size_t)(           desc.lln%nb) + desc.A12;
 
-    RUNTIME_desc_init( &desc );
-
     return desc;
 }
 
@@ -241,7 +249,8 @@ MORSE_desc_t* morse_desc_submatrix(MORSE_desc_t *descA, int i, int j, int m, int
     descB->mt = (m == 0) ? 0 : (descB->i+m-1)/mb - descB->i/mb + 1;
     descB->nt = (n == 0) ? 0 : (descB->j+n-1)/nb - descB->j/nb + 1;
 
-    RUNTIME_desc_submatrix( descB );
+    // Increase the number of occurences to avoid multiple free of runtime specific data structures.
+    descB->occurences++;
 
     return descB;
 }
@@ -301,7 +310,7 @@ int morse_desc_mat_alloc( MORSE_desc_t *desc )
 
     size_t size = (size_t)(desc->llm) * (size_t)(desc->lln)
         * (size_t)MORSE_Element_Size(desc->dtyp);
-    if ((desc->mat = RUNTIME_mat_alloc(size)) == NULL) {
+    if ((desc->mat = RUNTIME_malloc(size)) == NULL) {
         morse_error("morse_desc_mat_alloc", "malloc() failed");
         return MORSE_ERR_OUT_OF_RESOURCES;
     }
@@ -327,7 +336,7 @@ int morse_desc_mat_free( MORSE_desc_t *desc )
         size_t size = (size_t)(desc->llm) * (size_t)(desc->lln)
             * (size_t)MORSE_Element_Size(desc->dtyp);
 
-        RUNTIME_mat_free(desc->mat, size);
+        RUNTIME_free(desc->mat, size);
         desc->mat = NULL;
     }
     return MORSE_SUCCESS;
@@ -423,7 +432,7 @@ int MORSE_Desc_Create(MORSE_desc_t **descptr, void *mat, MORSE_enum dtyp, int mb
         size_t size = (size_t)(desc->llm) * (size_t)(desc->lln)
             * (size_t)MORSE_Element_Size(desc->dtyp);
 
-        if ((desc->mat = RUNTIME_mat_alloc(size)) == NULL) {
+        if ((desc->mat = RUNTIME_malloc(size)) == NULL) {
             morse_error("MORSE_Desc_Create", "malloc() failed");
             return MORSE_ERR_OUT_OF_RESOURCES;
         }
@@ -843,6 +852,6 @@ int MORSE_Desc_Getoncpu(MORSE_desc_t  *desc) {
  *
  *****************************************************************************/
 void MORSE_user_tag_size(int user_tag_width, int user_tag_sep) {
-    RUNTIME_user_tag_size(user_tag_width, user_tag_sep);
+    RUNTIME_comm_set_tag_sizes( user_tag_width, user_tag_sep );
     return;
 }
diff --git a/include/chameleon/morse_runtime.h b/include/chameleon/morse_runtime.h
index 051410897..dbcde462a 100644
--- a/include/chameleon/morse_runtime.h
+++ b/include/chameleon/morse_runtime.h
@@ -155,6 +155,15 @@ RUNTIME_resume( MORSE_context_t *ctxt );
 void
 RUNTIME_barrier( MORSE_context_t *ctxt );
 
+/**
+ * @brief Show the progress of the computations when enabled.
+ *
+ * @param[in] ctxt
+ *            The Chameleon context for which the context needs to be printed.
+ */
+void
+RUNTIME_progress( MORSE_context_t *ctxt );
+
 /**
  * @brief Get the rank of the current worker for the runtime.
  *
@@ -405,14 +414,21 @@ RUNTIME_desc_getoncpu( const MORSE_desc_t *desc );
  * This function is a asynchronous call that submit the data movement from
  * remote memory to the main memory. This call must be completed by a call to
  * RUNTIME_sequence_wait() to ensure that all data have been moved.
+ * Users should avoid to call this function as it sequentially moves back the
+ * data from outside the main memory to main memory, and should prefer
+ * RUNTIME_desc_getoncpu_async().
  *
  * @param[in] desc
  *            The descriptor to release.
  *
+ * @param[in] sequence
+ *            The sequence to which submit the data movements
+ *
  * @retval MORSE_SUCCESS on success
  */
 int
-RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc );
+RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc,
+                             MORSE_sequence_t   *sequence );
 
 /**
  * @brief Get the pointer to the data or the runtime handler associated to the
diff --git a/include/morse.h b/include/morse.h
index dbce6b279..6d745743f 100644
--- a/include/morse.h
+++ b/include/morse.h
@@ -84,8 +84,8 @@ int MORSE_Pause             (void);
 int MORSE_Resume            (void);
 int MORSE_Distributed_start (void);
 int MORSE_Distributed_stop  (void);
-int MORSE_Comm_size         (int *size);
-int MORSE_Comm_rank         (int *rank);
+int MORSE_Comm_size         (void);
+int MORSE_Comm_rank         (void);
 int MORSE_Lapack_to_Tile    (void *Af77, int LDA, MORSE_desc_t *A);
 int MORSE_Tile_to_Lapack    (MORSE_desc_t *A, void *Af77, int LDA);
 int MORSE_Distributed_start (void);
diff --git a/runtime/starpu/codelets/codelet_dataflush.c b/runtime/starpu/codelets/codelet_dataflush.c
index be96e164c..976c5dfca 100644
--- a/runtime/starpu/codelets/codelet_dataflush.c
+++ b/runtime/starpu/codelets/codelet_dataflush.c
@@ -56,8 +56,8 @@ int RUNTIME_desc_iscached(const MORSE_desc_t *A, int Am, int An)
 #endif
 #endif
 
-void MORSE_TASK_flush_data(const MORSE_option_t *options,
-                          const MORSE_desc_t *A, int Am, int An)
+void MORSE_TASK_flush_data( const MORSE_option_t *options,
+                            const MORSE_desc_t *A, int Am, int An )
 {
     (void)options;
 
diff --git a/runtime/starpu/control/runtime_async.c b/runtime/starpu/control/runtime_async.c
index 4d9b646ac..23efccf5f 100644
--- a/runtime/starpu/control/runtime_async.c
+++ b/runtime/starpu/control/runtime_async.c
@@ -28,7 +28,8 @@
 /*******************************************************************************
  *  Create a sequence
  **/
-int RUNTIME_sequence_create( MORSE_context_t *morse, MORSE_sequence_t *sequence )
+int RUNTIME_sequence_create( MORSE_context_t  *morse,
+                             MORSE_sequence_t *sequence )
 {
     (void)morse;
     (void)sequence;
@@ -38,60 +39,27 @@ int RUNTIME_sequence_create( MORSE_context_t *morse, MORSE_sequence_t *sequence
 /*******************************************************************************
  *  Destroy a sequence
  **/
-int RUNTIME_sequence_destroy( MORSE_context_t *morse, MORSE_sequence_t *sequence )
+int RUNTIME_sequence_destroy( MORSE_context_t  *morse,
+                              MORSE_sequence_t *sequence )
 {
     (void)morse;
     (void)sequence;
     return MORSE_SUCCESS;
 }
 
-// Defined in control/auxilliary.c
-extern void (*update_progress_callback)(int, int) ;
-
-// no progress indicator for algorithms faster than 'PROGRESS_MINIMUM_DURATION' seconds
-#define PROGRESS_MINIMUM_DURATION 10
-
-/*******************************************************************************
- *  Display a progress information when executing the tasks
- **/
-int RUNTIME_progress( MORSE_context_t *morse )
-{
-    int tasksLeft, current, timer = 0;
-    int max;
-
-#if defined(CHAMELEON_USE_MPI)
-    if ( morse->my_mpi_rank != 0 )
-        return MORSE_SUCCESS;
-#endif
-
-    max = starpu_task_nsubmitted();
-    if ( max == 0 )
-        return MORSE_SUCCESS;
-
-    //  update_progress_callback(0, max);
-    while ((tasksLeft = starpu_task_nsubmitted()) > 0) {
-        current = max - tasksLeft;
-        if (timer > PROGRESS_MINIMUM_DURATION)
-            update_progress_callback(current, max);
-        sleep(1);
-        timer++;
-    }
-    if (timer > PROGRESS_MINIMUM_DURATION)
-        update_progress_callback(max, max);
-
-    (void)morse;
-    return MORSE_SUCCESS;
-}
-
 /*******************************************************************************
  *  Wait for the completion of a sequence
  **/
-int RUNTIME_sequence_wait( MORSE_context_t *morse, MORSE_sequence_t *sequence )
+int RUNTIME_sequence_wait( MORSE_context_t  *morse,
+                           MORSE_sequence_t *sequence )
 {
     (void)morse;
     (void)sequence;
-    if (morse->progress_enabled)
+
+    if (morse->progress_enabled) {
         RUNTIME_progress(morse);
+    }
+
     starpu_task_wait_for_all();
 #if defined(CHAMELEON_USE_MPI)
     starpu_mpi_barrier(MPI_COMM_WORLD);
@@ -102,9 +70,12 @@ int RUNTIME_sequence_wait( MORSE_context_t *morse, MORSE_sequence_t *sequence )
 /*******************************************************************************
  *  Terminate a sequence
  **/
-void RUNTIME_sequence_flush( void *schedopt, MORSE_sequence_t *sequence, MORSE_request_t *request, int status)
+void RUNTIME_sequence_flush( MORSE_context_t  *morse,
+                             MORSE_sequence_t *sequence,
+                             MORSE_request_t  *request,
+                             int status )
 {
-    (void)schedopt;
+    (void)morse;
     sequence->request = request;
     sequence->status = status;
     request->status = status;
diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c
index a96a071e2..e14ef69c6 100644
--- a/runtime/starpu/control/runtime_context.c
+++ b/runtime/starpu/control/runtime_context.c
@@ -62,7 +62,6 @@ void RUNTIME_context_create( MORSE_context_t *morse )
 /*******************************************************************************
  *  Clean the context
  **/
-
 void RUNTIME_context_destroy( MORSE_context_t *morse )
 {
     /* StarPU was already initialized by an external library */
diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c
index bbf792875..aec18aa50 100644
--- a/runtime/starpu/control/runtime_control.c
+++ b/runtime/starpu/control/runtime_control.c
@@ -27,28 +27,13 @@
 #include <stdlib.h>
 #include "chameleon_starpu.h"
 
-#if defined(CHAMELEON_SIMULATION)
-# ifndef STARPU_SIMGRID
-#  error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support."
-# endif
-#else
-# ifdef STARPU_SIMGRID
-#  warning "Starpu was built with simgrid support. Better build Chameleon with simulation support (-DCHAMELEON_SIMULATION=YES)."
-# endif
-#endif
-/*******************************************************************************
- * Thread rank.
- **/
-int RUNTIME_rank(MORSE_context_t *morse)
-{
-    (void)morse;
-    return starpu_worker_get_id();
-}
-
 /*******************************************************************************
  *
  **/
-int RUNTIME_init_scheduler( MORSE_context_t *morse, int ncpus, int ncudas, int nthreads_per_worker)
+int RUNTIME_init( MORSE_context_t *morse,
+                  int ncpus,
+                  int ncudas,
+                  int nthreads_per_worker )
 {
     starpu_conf_t *conf = (starpu_conf_t*)(morse->schedopt);
     int hres = -1;
@@ -137,18 +122,19 @@ int RUNTIME_init_scheduler( MORSE_context_t *morse, int ncpus, int ncudas, int n
 /*******************************************************************************
  *
  */
-void RUNTIME_finalize_scheduler( MORSE_context_t *morse )
+void RUNTIME_finalize( MORSE_context_t *morse )
 {
     (void)morse;
 
     /* StarPU was already initialized by an external library */
-    if (morse->schedopt == NULL) {
+    if ( morse->schedopt == NULL ) {
         return;
     }
 
 #if defined(CHAMELEON_USE_MPI)
     starpu_mpi_shutdown();
 #endif
+
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
     starpu_cublas_shutdown();
 #endif
@@ -157,6 +143,27 @@ void RUNTIME_finalize_scheduler( MORSE_context_t *morse )
     return;
 }
 
+/*******************************************************************************
+ *  To suspend the processing of new tasks by workers
+ **/
+void RUNTIME_pause( MORSE_context_t *morse )
+{
+    (void)morse;
+    starpu_pause();
+    return;
+}
+
+/*******************************************************************************
+ *  This is the symmetrical call to RUNTIME_pause,
+ *  used to resume the workers polling for new tasks.
+ **/
+void RUNTIME_resume( MORSE_context_t *morse )
+{
+    (void)morse;
+    starpu_resume();
+    return;
+}
+
 /*******************************************************************************
  *  Busy-waiting barrier
  **/
@@ -169,84 +176,102 @@ void RUNTIME_barrier( MORSE_context_t *morse )
 #endif
 }
 
+// Defined in control/auxilliary.c
+extern void (*update_progress_callback)(int, int);
+
+// no progress indicator for algorithms faster than 'PROGRESS_MINIMUM_DURATION' seconds
+#define PROGRESS_MINIMUM_DURATION 10
+
 /*******************************************************************************
- *  Set iteration numbers for traces
+ *  Display a progress information when executing the tasks
  **/
-void RUNTIME_iteration_push( MORSE_context_t *morse, unsigned long iteration )
+void RUNTIME_progress( MORSE_context_t *morse )
 {
-    (void)morse;
-#if defined(HAVE_STARPU_ITERATION_PUSH)
-    starpu_iteration_push(iteration);
+    int tasksLeft, current, timer = 0;
+    int max;
+
+#if defined(CHAMELEON_USE_MPI)
+    if ( morse->my_mpi_rank != 0 ) {
+        return;
+    }
 #endif
-}
 
-void RUNTIME_iteration_pop( MORSE_context_t *morse )
-{
+    max = starpu_task_nsubmitted();
+    if ( max == 0 ) {
+        return;
+    }
+
+    //  update_progress_callback(0, max);
+    while ((tasksLeft = starpu_task_nsubmitted()) > 0) {
+        current = max - tasksLeft;
+        if (timer > PROGRESS_MINIMUM_DURATION) {
+            update_progress_callback(current, max);
+        }
+        sleep(1);
+        timer++;
+    }
+    if (timer > PROGRESS_MINIMUM_DURATION) {
+        update_progress_callback(max, max);
+    }
+
     (void)morse;
-#if defined(HAVE_STARPU_ITERATION_PUSH)
-    starpu_iteration_pop();
-#endif
+    return;
 }
 
 /*******************************************************************************
- *  To suspend the processing of new tasks by workers
+ * Thread rank.
  **/
-void RUNTIME_pause( MORSE_context_t *morse )
+int RUNTIME_thread_rank( MORSE_context_t *morse )
 {
     (void)morse;
-    starpu_pause();
-    return;
+    return starpu_worker_get_id();
 }
 
 /*******************************************************************************
- *  This is the symmetrical call to RUNTIME_pause,
- *  used to resume the workers polling for new tasks.
+ * Thread rank.
  **/
-void RUNTIME_resume( MORSE_context_t *morse )
+int RUNTIME_thread_size( MORSE_context_t *morse )
 {
     (void)morse;
-    starpu_resume();
-    return;
+    return starpu_worker_get_count_by_type( STARPU_CPU_WORKER );
 }
 
 /*******************************************************************************
- *  This returns the rank of this process
+ *  The process rank
  **/
-void RUNTIME_comm_rank( int *rank )
+int RUNTIME_comm_rank( MORSE_context_t *morse )
 {
+    int rank;
 #if defined(CHAMELEON_USE_MPI)
 #  if defined(HAVE_STARPU_MPI_COMM_RANK)
-    starpu_mpi_comm_rank(MPI_COMM_WORLD, rank);
+    starpu_mpi_comm_rank( MPI_COMM_WORLD, &rank );
 #  else
-    MPI_Comm_rank(MPI_COMM_WORLD, rank);
+    MPI_Comm_rank( MPI_COMM_WORLD, &rank );
 #  endif
 #else
-    *rank = 0;
+    rank = 0;
 #endif
-    return;
+
+    (void)morse;
+    return rank;
 }
 
 /*******************************************************************************
  *  This returns the size of the distributed computation
  **/
-void RUNTIME_comm_size( int *size )
+int RUNTIME_comm_size( MORSE_context_t *morse )
 {
+    int size;
 #if defined(CHAMELEON_USE_MPI)
 #  if defined(HAVE_STARPU_MPI_COMM_RANK)
-    starpu_mpi_comm_size(MPI_COMM_WORLD, size);
+    starpu_mpi_comm_size( MPI_COMM_WORLD, &size );
 #  else
-    MPI_Comm_size(MPI_COMM_WORLD, size);
+    MPI_Comm_size( MPI_COMM_WORLD, &size );
 #  endif
 #else
-    *size = 1;
+    size = 1;
 #endif
-    return;
-}
 
-/*******************************************************************************
- *  This returns the number of workers
- **/
-int RUNTIME_get_thread_nbr()
-{
-    return starpu_worker_get_count_by_type( STARPU_CPU_WORKER );
+    (void)morse;
+    return size;
 }
diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c
index b14888842..a1e434e8e 100644
--- a/runtime/starpu/control/runtime_descriptor.c
+++ b/runtime/starpu/control/runtime_descriptor.c
@@ -25,6 +25,9 @@
 #include <unistd.h>
 #include "chameleon_starpu.h"
 
+/*******************************************************************************
+ *  Set the tag sizes
+ **/
 #if defined(CHAMELEON_USE_MPI)
 
 /* Take 24 bits for the tile id, and 7 bits for descriptor id.
@@ -44,13 +47,9 @@ static int _tag_mpi_initialized_ = 0;
 
 #endif
 
-#ifdef STARPU_MALLOC_SIMULATION_FOLDED
-#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED
-#else
-#define FOLDED 0
-#endif
-
-void RUNTIME_user_tag_size( int user_tag_width, int user_tag_sep ) {
+void RUNTIME_comm_set_tag_sizes( int user_tag_width,
+                                 int user_tag_sep )
+{
 #if defined(CHAMELEON_USE_MPI)
     if (_tag_mpi_initialized_ == 0) {
         tag_width = user_tag_width;
@@ -63,29 +62,43 @@ void RUNTIME_user_tag_size( int user_tag_width, int user_tag_sep ) {
     (void)user_tag_width; (void)user_tag_sep;
 }
 
+/*******************************************************************************
+ *  Malloc/Free of the data
+ **/
+#ifdef STARPU_MALLOC_SIMULATION_FOLDED
+#define FOLDED STARPU_MALLOC_SIMULATION_FOLDED
+#else
+#define FOLDED 0
+#endif
 
-void *RUNTIME_mat_alloc( size_t size )
+void *RUNTIME_malloc( size_t size )
 {
 #if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
     return (void*) 1;
 #else
-    void *mat;
+    void *ptr;
 
-    if (starpu_malloc_flags(&mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0)
+    if (starpu_malloc_flags(&ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT) != 0) {
         return NULL;
-    return mat;
+    }
+    return ptr;
 #endif
 }
 
-void RUNTIME_mat_free( void *mat, size_t size )
+void RUNTIME_free( void  *ptr,
+                   size_t size )
 {
 #if defined(CHAMELEON_SIMULATION) && !defined(STARPU_MALLOC_SIMULATION_FOLDED) && !defined(CHAMELEON_USE_MPI)
+    (void)ptr; (void)size;
     return;
 #else
-    starpu_free_flags(mat, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT);
+    starpu_free_flags(ptr, size, STARPU_MALLOC_PINNED|FOLDED|STARPU_MALLOC_COUNT);
 #endif
 }
 
+/*******************************************************************************
+ *  Create data descriptor
+ **/
 void RUNTIME_desc_create( MORSE_desc_t *desc )
 {
     int64_t lmt = desc->lmt;
@@ -104,34 +117,34 @@ void RUNTIME_desc_create( MORSE_desc_t *desc )
     tiles = (starpu_data_handle_t*)(desc->schedopt);
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
-    if (desc->use_mat == 1 && desc->register_mat == 1){
-        /*
-         * Register allocated memory as CUDA pinned memory
-         */
-        {
-            int64_t eltsze = MORSE_Element_Size(desc->dtyp);
-            size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze;
-            cudaError_t rc;
+    /*
+     * Register allocated memory as CUDA pinned memory
+     */
+    if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
+    {
+        int64_t eltsze = MORSE_Element_Size(desc->dtyp);
+        size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze;
+        cudaError_t rc;
 
-            /* Register the matrix as pinned memory */
-            rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable );
-            if ( rc != cudaSuccess )
-            {
-                morse_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc ));
-            }
+        /* Register the matrix as pinned memory */
+        rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable );
+        if ( rc != cudaSuccess )
+        {
+            morse_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc ));
         }
     }
 #endif
-    if (desc->ooc) {
-        int lastmm = desc->lm - (desc->lmt-1) * desc->mb;
-        int lastnn = desc->ln - (desc->lnt-1) * desc->nb;
-        int64_t eltsze = MORSE_Element_Size(desc->dtyp);
-        int pagesize = getpagesize();
 
-        if ((desc->mb * desc->nb * eltsze) % pagesize != 0
-            || (lastmm   * desc->nb * eltsze) % pagesize != 0
-            || (desc->mb * lastnn   * eltsze) % pagesize != 0
-            || (lastmm   * lastnn   * eltsze) % pagesize != 0)
+    if (desc->ooc) {
+        int     lastmm   = desc->lm - (desc->lmt-1) * desc->mb;
+        int     lastnn   = desc->ln - (desc->lnt-1) * desc->nb;
+        int64_t eltsze   = MORSE_Element_Size(desc->dtyp);
+        int     pagesize = getpagesize();
+
+        if ( ((desc->mb * desc->nb * eltsze) % pagesize != 0) ||
+             ((lastmm   * desc->nb * eltsze) % pagesize != 0) ||
+             ((desc->mb * lastnn   * eltsze) % pagesize != 0) ||
+             ((lastmm   * lastnn   * eltsze) % pagesize != 0) )
         {
             morse_error("RUNTIME_desc_create", "Matrix and tile size not suitable for out-of-core: all tiles have to be multiples of 4096. Tip : choose 'n' and 'nb' as both multiples of 32.");
             return;
@@ -153,8 +166,9 @@ void RUNTIME_desc_create( MORSE_desc_t *desc )
                 morse_error("RUNTIME_desc_create", "MPI_TAG_UB not known by MPI");
             }
 
-            while ( ((uintptr_t)((1UL<<tag_width) - 1) > (uintptr_t)(*tag_ub) )
-                    && (tag_width >= TAG_WIDTH_MIN) ) {
+            while ( ((uintptr_t)((1UL<<tag_width) - 1) > (uintptr_t)(*tag_ub) ) &&
+                    (tag_width >= TAG_WIDTH_MIN) )
+            {
                 tag_width--;
                 tag_sep--;
             }
@@ -183,6 +197,9 @@ void RUNTIME_desc_create( MORSE_desc_t *desc )
 #endif
 }
 
+/*******************************************************************************
+ *  Destroy data descriptor
+ **/
 void RUNTIME_desc_destroy( MORSE_desc_t *desc )
 {
     desc->occurences--;
@@ -197,7 +214,7 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc )
         int lnt = desc->lnt;
         int m, n;
 
-        for (n = 0; n < lnt; n++)
+        for (n = 0; n < lnt; n++) {
             for (m = 0; m < lmt; m++)
             {
                 if (*handle == NULL)
@@ -205,13 +222,14 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc )
                     handle++;
                     continue;
                 }
-                //printf("\nUnregister %d %d %d", MORSE_My_Mpi_Rank(), m, n);
                 starpu_data_unregister(*handle);
                 handle++;
             }
+        }
 
 #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
-        if (desc->use_mat == 1 && desc->register_mat == 1){
+        if ( (desc->use_mat == 1) && (desc->register_mat == 1) )
+        {
             /* Unmap the pinned memory associated to the matrix */
             if (cudaHostUnregister(desc->mat) != cudaSuccess)
             {
@@ -226,28 +244,43 @@ void RUNTIME_desc_destroy( MORSE_desc_t *desc )
     }
 }
 
-void RUNTIME_desc_init( MORSE_desc_t *desc )
+/*******************************************************************************
+ *  Acquire data
+ **/
+int RUNTIME_desc_acquire( const MORSE_desc_t *desc )
 {
-    (void)desc;
-    return;
-}
+    starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
+    int lmt = desc->lmt;
+    int lnt = desc->lnt;
+    int m, n;
 
-void RUNTIME_desc_submatrix( MORSE_desc_t *desc )
-{
-    desc->occurences++;
-    return;
+    for (n = 0; n < lnt; n++) {
+        for (m = 0; m < lmt; m++)
+        {
+            if ( (*handle == NULL) ||
+                 !morse_desc_islocal( desc, m, n ) )
+            {
+                handle++;
+                continue;
+            }
+            starpu_data_acquire(*handle, STARPU_R);
+            handle++;
+        }
+    }
+    return MORSE_SUCCESS;
 }
 
-/* TODO: Acquire/Release/GetonCPU need to be studied carefully and fixed
- * because we are not using them correctly */
-int RUNTIME_desc_acquire( MORSE_desc_t *desc )
+/*******************************************************************************
+ *  Release data
+ **/
+int RUNTIME_desc_release( const MORSE_desc_t *desc )
 {
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
     int lmt = desc->lmt;
     int lnt = desc->lnt;
     int m, n;
 
-    for (n = 0; n < lnt; n++)
+    for (n = 0; n < lnt; n++) {
         for (m = 0; m < lmt; m++)
         {
             if ( (*handle == NULL) ||
@@ -256,20 +289,31 @@ int RUNTIME_desc_acquire( MORSE_desc_t *desc )
                 handle++;
                 continue;
             }
-            starpu_data_acquire(*handle, STARPU_R);
+            starpu_data_release(*handle);
             handle++;
         }
+    }
     return MORSE_SUCCESS;
 }
 
-int RUNTIME_desc_release( MORSE_desc_t *desc )
+/*******************************************************************************
+ *  Get data on cpu - Synchronous call
+ **/
+int RUNTIME_desc_getoncpu( const MORSE_desc_t *desc )
 {
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
     int lmt = desc->lmt;
     int lnt = desc->lnt;
     int m, n;
 
-    for (n = 0; n < lnt; n++)
+    if ( desc->ooc ) {
+        /* May not even fit */
+        morse_warning( "RUNTIME_desc_getoncpu(StarPU)",
+                       "Try to get an out-of-core matrix on main memory. Cancelled as it might not fit" );
+        return MORSE_SUCCESS;
+    }
+
+    for (n = 0; n < lnt; n++) {
         for (m = 0; m < lmt; m++)
         {
             if ( (*handle == NULL) ||
@@ -278,31 +322,34 @@ int RUNTIME_desc_release( MORSE_desc_t *desc )
                 handle++;
                 continue;
             }
+
+            starpu_data_acquire(*handle, STARPU_R);
             starpu_data_release(*handle);
             handle++;
         }
+    }
     return MORSE_SUCCESS;
 }
 
-/**
- * For older revision of StarPU, STARPU_MAIN_RAM is not defined
- */
-#ifndef STARPU_MAIN_RAM
-#define STARPU_MAIN_RAM 0
-#endif
-
-int RUNTIME_desc_getoncpu( MORSE_desc_t *desc )
+/*******************************************************************************
+ *  Get data on cpu - Asynchronous call
+ **/
+int RUNTIME_desc_getoncpu_async( const MORSE_desc_t *desc,
+                                 MORSE_sequence_t   *sequence )
 {
     starpu_data_handle_t *handle = (starpu_data_handle_t*)(desc->schedopt);
     int lmt = desc->lmt;
     int lnt = desc->lnt;
     int m, n;
 
-    if (desc->ooc)
+    if ( desc->ooc ) {
         /* May not even fit */
+        morse_warning( "RUNTIME_desc_getoncpu_async(StarPU)",
+                       "Try to get an out-of-core matrix on main memory. Cancelled as it might not fit" );
         return MORSE_SUCCESS;
+    }
 
-    for (n = 0; n < lnt; n++)
+    for (n = 0; n < lnt; n++) {
         for (m = 0; m < lmt; m++)
         {
             if ( (*handle == NULL) ||
@@ -312,14 +359,25 @@ int RUNTIME_desc_getoncpu( MORSE_desc_t *desc )
                 continue;
             }
 
-            starpu_data_acquire(*handle, STARPU_R);
-            starpu_data_release(*handle);
+            starpu_data_acquire_cb( *handle, STARPU_R,
+                                    (void (*)(void*))&starpu_data_release, *handle );
             handle++;
         }
+    }
 
+    (void)sequence;
     return MORSE_SUCCESS;
 }
 
+/*******************************************************************************
+ *  Get data addr
+ **/
+
+/* For older revision of StarPU, STARPU_MAIN_RAM is not defined */
+#ifndef STARPU_MAIN_RAM
+#define STARPU_MAIN_RAM 0
+#endif
+
 void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n )
 {
     int64_t im = m + (desc->i / desc->mb);
@@ -344,12 +402,12 @@ void *RUNTIME_desc_getaddr( const MORSE_desc_t *desc, int m, int n )
             }
         }
 
-        starpu_matrix_data_register(ptrtile, home_node, (uintptr_t) user_ptr,
-                                    BLKLDD(desc, im),
-                                    tempmm, tempnn, eltsze);
+        starpu_matrix_data_register( ptrtile, home_node, (uintptr_t) user_ptr,
+                                     BLKLDD(desc, im),
+                                     tempmm, tempnn, eltsze );
 
 #ifdef HAVE_STARPU_DATA_SET_COORDINATES
-        starpu_data_set_coordinates(*ptrtile, 2, m, n);
+        starpu_data_set_coordinates( *ptrtile, 2, m, n );
 #endif
 
 #if defined(CHAMELEON_USE_MPI)
diff --git a/runtime/starpu/control/runtime_profiling.c b/runtime/starpu/control/runtime_profiling.c
index 4b9c700d5..e3021a766 100644
--- a/runtime/starpu/control/runtime_profiling.c
+++ b/runtime/starpu/control/runtime_profiling.c
@@ -40,6 +40,25 @@ double RUNTIME_get_time(){
     return starpu_timing_now()*1e-6;
 }
 
+/*******************************************************************************
+ *  Set iteration numbers for traces
+ **/
+void RUNTIME_iteration_push( MORSE_context_t *morse, unsigned long iteration )
+{
+    (void)morse;
+#if defined(HAVE_STARPU_ITERATION_PUSH)
+    starpu_iteration_push(iteration);
+#endif
+}
+
+void RUNTIME_iteration_pop( MORSE_context_t *morse )
+{
+    (void)morse;
+#if defined(HAVE_STARPU_ITERATION_PUSH)
+    starpu_iteration_pop();
+#endif
+}
+
 void RUNTIME_start_profiling(){
 #if defined(HAVE_STARPU_FXT_PROFILING)
 	starpu_fxt_start_profiling();
diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index 5c24359b8..82a71f96b 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -59,6 +59,16 @@
 #endif
 #endif
 
+#if defined(CHAMELEON_SIMULATION)
+# if !defined(STARPU_SIMGRID)
+#  error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support."
+# endif
+#else
+# if defined(STARPU_SIMGRID)
+#  warning "Starpu was built with simgrid support. Better build Chameleon with simulation support (-DCHAMELEON_SIMULATION=YES)."
+# endif
+#endif
+
 #include "control/common.h"
 #include "runtime_codelets.h"
 #include "runtime_profiling.h"
-- 
GitLab