From 0cc694735ddcd0645e4c4a2c23dd2ee3a5d2934d Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 12 Mar 2024 14:40:46 +0100
Subject: [PATCH] parsec/map: Fix the map codelet to always use a tile
 interface and do not force the user to check for the runtime used in the
 map functions.

---
 compute/pzlatms.c                     | 28 +++-------
 compute/zprint.c                      | 26 ++-------
 runtime/parsec/codelets/codelet_map.c | 78 ++++++++++++++++++++++-----
 3 files changed, 75 insertions(+), 57 deletions(-)

diff --git a/compute/pzlatms.c b/compute/pzlatms.c
index 8e6ba0f5f..e8daf897e 100644
--- a/compute/pzlatms.c
+++ b/compute/pzlatms.c
@@ -13,7 +13,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Lionel Eyraud-Dubois
- * @date 2024-03-11
+ * @date 2024-03-14
  * @precisions normal z -> s d c
  *
  */
@@ -26,39 +26,26 @@
 
 #define A(m, n) A,  m,  n
 
-/*
- * Static variable to know how to handle the data within the kernel
- * This assumes that only one runtime is enabled at a time.
- */
-static RUNTIME_id_t zlatms_runtime_id = RUNTIME_SCHED_STARPU;
-
 static inline int
 zlaset_diag_cpu( void *op_args,
                  cham_uplo_t uplo, int m, int n, int ndata,
                  const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
 {
-    CHAMELEON_Complex64_t *A;
-    const double *D = (const double *)op_args;
+    const double          *D = (const double *)op_args;
+    CHAMELEON_Complex64_t *A = CHAM_tile_get_ptr( tileA );
 
     int tempmm = m == descA->mt-1 ? descA->m-m*descA->mb : descA->mb;
     int tempnn = n == descA->nt-1 ? descA->n-n*descA->nb : descA->nb;
-    int minmn = chameleon_min( tempmm, tempnn );
-    int lda, i;
+    int minmn  = chameleon_min( tempmm, tempnn );
+    int lda    = tileA->ld;
+    int i;
 
     if ( ndata > 1 ) {
         fprintf( stderr, "zlaset_diag_cpu: supports only one piece of data and %d have been given\n", ndata );
     }
 
-    if ( zlatms_runtime_id == RUNTIME_SCHED_PARSEC ) {
-        A   = (CHAMELEON_Complex64_t*)tileA;
-        lda = descA->get_blkldd( descA, m );
-    }
-    else {
-        A   = tileA->mat;
-        lda = tileA->ld;
-    }
-
     assert( m == n );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
 
     /* Shift to the values corresponding to the tile */
     D += m * descA->mb;
@@ -104,7 +91,6 @@ void chameleon_pzlatms( cham_dist_t idist, unsigned long long int seed, cham_sym
         return;
     }
     ib = CHAMELEON_IB;
-    zlatms_runtime_id = chamctxt->scheduler;
 
     RUNTIME_options_init(&options, chamctxt, sequence, request);
 
diff --git a/compute/zprint.c b/compute/zprint.c
index cc2be4076..e17a406a1 100644
--- a/compute/zprint.c
+++ b/compute/zprint.c
@@ -12,7 +12,7 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Matthieu Kuhn
- * @date 2024-03-11
+ * @date 2024-03-14
  * @precisions normal z -> s d c
  *
  */
@@ -21,12 +21,6 @@
 #include <coreblas/coreblas_z.h>
 #endif
 
-/*
- * Static variable to know how to handle the data within the kernel
- * This assumes that only one runtime is enabled at a time.
- */
-static RUNTIME_id_t zprint_runtime_id = RUNTIME_SCHED_STARPU;
-
 struct zprint_args_s {
     FILE       *file;
     const char *header;
@@ -37,25 +31,17 @@ zprint_cpu( void *op_args,
             cham_uplo_t uplo, int m, int n, int ndata,
             const CHAM_desc_t *descA, CHAM_tile_t *tileA, ... )
 {
-    CHAMELEON_Complex64_t *A;
     struct zprint_args_s  *options = (struct zprint_args_s *)op_args;
+    CHAMELEON_Complex64_t *A = CHAM_tile_get_ptr( tileA );
 
     int tempmm = m == descA->mt-1 ? descA->m-m*descA->mb : descA->mb;
     int tempnn = n == descA->nt-1 ? descA->n-n*descA->nb : descA->nb;
-    int lda;
+    int lda    = tileA->ld;
 
     if ( ndata > 1 ) {
         fprintf( stderr, "zprint_cpu: supports only one piece of data and %d have been given\n", ndata );
     }
-
-    if ( zprint_runtime_id == RUNTIME_SCHED_PARSEC ) {
-        A   = (CHAMELEON_Complex64_t*)tileA;
-        lda = descA->get_blkldd( descA, m );
-    }
-    else {
-        A   = CHAM_tile_get_ptr( tileA );
-        lda = tileA->ld;
-    }
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
 
 #if !defined(CHAMELEON_SIMULATION)
     CORE_zprint( options->file, options->header, uplo,
@@ -162,8 +148,6 @@ int CHAMELEON_zprint( FILE *file, const char *header,
                          A, NB, NB, LDA, N, M, N, sequence, &request );
 
     /* Call the tile interface */
-    zprint_runtime_id = chamctxt->scheduler;
-
     data.access = ChamR;
     data.desc   = &descAt;
     chameleon_pmap( uplo, 1, &data, &zprint_map, &options, sequence, &request );
@@ -230,8 +214,6 @@ int CHAMELEON_zprint_Tile( FILE *file, const char *header,
     }
     chameleon_sequence_create( chamctxt, &sequence );
 
-    zprint_runtime_id = chamctxt->scheduler;
-
     data.access = ChamR;
     data.desc   = A;
 
diff --git a/runtime/parsec/codelets/codelet_map.c b/runtime/parsec/codelets/codelet_map.c
index 12a701bf3..d14ee9996 100644
--- a/runtime/parsec/codelets/codelet_map.c
+++ b/runtime/parsec/codelets/codelet_map.c
@@ -11,7 +11,7 @@
  *
  * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2024-03-11
+ * @date 2024-03-14
  *
  */
 #include "chameleon_parsec.h"
@@ -30,11 +30,21 @@ CORE_map_one_parsec( parsec_execution_stream_t *context,
                      parsec_task_t             *this_task )
 {
     struct parsec_map_args_s *pargs = NULL;
-    CHAM_tile_t              *tileA;
+    const CHAM_desc_t        *descA;
+    CHAM_tile_t               tileA;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &(tileA.mat) );
+
+    descA = pargs->desc[0];
+    tileA.rank    = 0;
+    tileA.m       = (pargs->m == (descA->mt-1)) ? (descA->m - pargs->m * descA->mb) : descA->mb;
+    tileA.n       = (pargs->n == (descA->nt-1)) ? (descA->n - pargs->n * descA->nb) : descA->nb;
+    tileA.ld      = descA->get_blkldd( descA, pargs->m );
+    tileA.format  = CHAMELEON_TILE_FULLRANK;
+    tileA.flttype = descA->dtyp;
 
-    parsec_dtd_unpack_args( this_task, &pargs, &tileA );
     pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 1,
-                             pargs->desc[0], tileA );
+                             descA, &tileA );
 
     free( pargs );
 }
@@ -44,12 +54,29 @@ CORE_map_two_parsec( parsec_execution_stream_t *context,
                      parsec_task_t             *this_task )
 {
     struct parsec_map_args_s *pargs = NULL;
-    CHAM_tile_t              *tileA;
-    CHAM_tile_t              *tileB;
+    const CHAM_desc_t        *descA, *descB;
+    CHAM_tile_t               tileA,  tileB;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &(tileA.mat), &(tileB.mat) );
+
+    descA = pargs->desc[0];
+    tileA.rank    = 0;
+    tileA.m       = (pargs->m == (descA->mt-1)) ? (descA->m - pargs->m * descA->mb) : descA->mb;
+    tileA.n       = (pargs->n == (descA->nt-1)) ? (descA->n - pargs->n * descA->nb) : descA->nb;
+    tileA.ld      = descA->get_blkldd( descA, pargs->m );
+    tileA.format  = CHAMELEON_TILE_FULLRANK;
+    tileA.flttype = descA->dtyp;
+
+    descB = pargs->desc[1];
+    tileB.rank    = 0;
+    tileB.m       = (pargs->m == (descB->mt-1)) ? (descB->m - pargs->m * descB->mb) : descB->mb;
+    tileB.n       = (pargs->n == (descB->nt-1)) ? (descB->n - pargs->n * descB->nb) : descB->nb;
+    tileB.ld      = descB->get_blkldd( descB, pargs->m );
+    tileB.format  = CHAMELEON_TILE_FULLRANK;
+    tileB.flttype = descB->dtyp;
 
-    parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB );
     pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 2,
-                             pargs->desc[0], tileA, pargs->desc[1], tileB );
+                             descA, &tileA, descB, &tileB );
 
     free( pargs );
 }
@@ -59,14 +86,37 @@ CORE_map_three_parsec( parsec_execution_stream_t *context,
                        parsec_task_t             *this_task )
 {
     struct parsec_map_args_s *pargs = NULL;
-    CHAM_tile_t              *tileA;
-    CHAM_tile_t              *tileB;
-    CHAM_tile_t              *tileC;
+    const CHAM_desc_t        *descA, *descB, *descC;
+    CHAM_tile_t               tileA,  tileB,  tileC;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &(tileA.mat), &(tileB.mat), &(tileC.mat) );
+
+    descA = pargs->desc[0];
+    tileA.rank    = 0;
+    tileA.m       = (pargs->m == (descA->mt-1)) ? (descA->m - pargs->m * descA->mb) : descA->mb;
+    tileA.n       = (pargs->n == (descA->nt-1)) ? (descA->n - pargs->n * descA->nb) : descA->nb;
+    tileA.ld      = descA->get_blkldd( descA, pargs->m );
+    tileA.format  = CHAMELEON_TILE_FULLRANK;
+    tileA.flttype = descA->dtyp;
+
+    descB = pargs->desc[1];
+    tileB.rank    = 0;
+    tileB.m       = (pargs->m == (descB->mt-1)) ? (descB->m - pargs->m * descB->mb) : descB->mb;
+    tileB.n       = (pargs->n == (descB->nt-1)) ? (descB->n - pargs->n * descB->nb) : descB->nb;
+    tileB.ld      = descB->get_blkldd( descB, pargs->m );
+    tileB.format  = CHAMELEON_TILE_FULLRANK;
+    tileB.flttype = descB->dtyp;
+
+    descC = pargs->desc[2];
+    tileC.rank    = 0;
+    tileC.m       = (pargs->m == (descC->mt-1)) ? (descC->m - pargs->m * descC->mb) : descC->mb;
+    tileC.n       = (pargs->n == (descC->nt-1)) ? (descC->n - pargs->n * descC->nb) : descC->nb;
+    tileC.ld      = descC->get_blkldd( descC, pargs->m );
+    tileC.format  = CHAMELEON_TILE_FULLRANK;
+    tileC.flttype = descC->dtyp;
 
-    parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB, &tileC );
     pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 3,
-                             pargs->desc[0], tileA, pargs->desc[1], tileB,
-                             pargs->desc[2], tileC );
+                             descA, &tileA, descB, &tileB, descC, &tileC );
 
     free( pargs );
 }
-- 
GitLab