diff --git a/include/chameleon/tasks.h b/include/chameleon/tasks.h
index 80efe7dd913952d56261b1454e9cb8b850d99c63..b17e6bc32a8bd6b200e0122e4ac2131c12828cc4 100644
--- a/include/chameleon/tasks.h
+++ b/include/chameleon/tasks.h
@@ -16,7 +16,7 @@
  * @author Cedric Augonnet
  * @author Florent Pruvost
  * @author Matthieu Kuhn
- * @date 2023-08-31
+ * @date 2024-03-11
  *
  */
 #ifndef _chameleon_tasks_h_
@@ -97,9 +97,49 @@ typedef int (*cham_unary_operator_t)( const CHAM_desc_t *desc,
                                       cham_uplo_t uplo, int m, int n,
                                       CHAM_tile_t *data, void *op_args );
 
+typedef int (*cham_map_cpu_fct_t)( void *args, cham_uplo_t uplo, int m, int n, int ndata,
+                                   const CHAM_desc_t *desc, CHAM_tile_t *tile, ... );
+
+#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION)
+#include "gpucublas.h"
+typedef int (*cham_map_cuda_fct_t)( cublasHandle_t handle, void *args,
+                                    cham_uplo_t uplo, int m, int n, int ndata,
+                                    const CHAM_desc_t *desc, CHAM_tile_t *tile, ... );
+#else
+typedef void *cham_map_cuda_fct_t;
+#endif
+
+#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION)
+#include "gpuhipblas.h"
+typedef int (*cham_map_hip_fct_t)( hipblasHandle_t handle, void *args,
+                                   cham_uplo_t uplo, int m, int n, int ndata,
+                                   const CHAM_desc_t *desc, CHAM_tile_t *tile, ... );
+#else
+typedef void *cham_map_hip_fct_t;
+#endif
+
+/**
+ * @brief Structure to store the operator functions on any architecture
+ */
+typedef struct cham_map_operator_s {
+    const char         *name;     /**< Name of the operator to be used in debug/tracing mode */
+    cham_map_cpu_fct_t  cpufunc;  /**< Pointer to the CPU function of the operator           */
+    cham_map_cuda_fct_t cudafunc; /**< Pointer to the CUDA/cuBLAS function of the operator   */
+    cham_map_hip_fct_t  hipfunc;  /**< Pointer to the HIP function of the operator           */
+} cham_map_operator_t;
+
+/**
+ * @brief Structure to store the data information in the map operation
+ */
+typedef struct cham_map_data_s {
+    cham_access_t      access; /**< Access type to the descriptor. Must be one of ChamR, ChamW, ChamRW. */
+    const CHAM_desc_t *desc;   /**< Descriptor in which the data is taken to apply the map operation.   */
+} cham_map_data_t;
+
 void INSERT_TASK_map( const RUNTIME_option_t *options,
-                      cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
-                      cham_unary_operator_t op_fct, void *op_args );
+                      cham_uplo_t uplo, int m, int n,
+                      int ndata, cham_map_data_t *data,
+                      cham_map_operator_t *op_fcts, void *op_args );
 
 void INSERT_TASK_gemm( const RUNTIME_option_t *options,
                        cham_trans_t transA, cham_trans_t transB,
diff --git a/runtime/openmp/codelets/codelet_map.c b/runtime/openmp/codelets/codelet_map.c
index bea91ea906a57110d9c0dc262ccad74d813845d8..8bbe7742b10224f1428c8d76edcc040af9676d3b 100644
--- a/runtime/openmp/codelets/codelet_map.c
+++ b/runtime/openmp/codelets/codelet_map.c
@@ -9,43 +9,1025 @@
  *
  * @brief Chameleon map OpenMP codelet
  *
- * @version 1.2.0
- * @author Philippe Virouleau
+ * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2022-02-22
+ * @date 2024-03-11
  *
  */
 #include "chameleon_openmp.h"
 
-void INSERT_TASK_map( const RUNTIME_option_t *options,
-                      cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
-                      cham_unary_operator_t op_fct, void *op_args )
+typedef void (*omp_map_fct_t)( cham_uplo_t uplo, int m, int n, int ndata,
+                               cham_map_data_t     *data,
+                               cham_map_operator_t *op_fcts,
+                               void                *op_args );
+
+/*
+ * @brief Open map function type with one tile
+ */
+static inline void omp_map_one_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                  cham_map_data_t     *data,
+                                  cham_map_operator_t *op_fcts,
+                                  void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+
+    assert( ndata == 1 );
+    assert( A->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA );
+    }
+}
+
+static inline void omp_map_one_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                  cham_map_data_t     *data,
+                                  cham_map_operator_t *op_fcts,
+                                  void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+
+    assert( ndata == 1 );
+    assert( A->access == ChamW );
+
+#pragma omp task depend( out:tileA[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA );
+    }
+}
+
+static inline void omp_map_one_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                   cham_map_data_t     *data,
+                                   cham_map_operator_t *op_fcts,
+                                   void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+
+    assert( ndata == 1 );
+    assert( A->access == ChamRW );
+
+#pragma omp task depend( in:tileA[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 1, descA, tileA );
+    }
+}
+
+/*
+ * @brief Open map function type with two tiles
+ */
+static inline void omp_map_two_R_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                    cham_map_data_t     *data,
+                                    cham_map_operator_t *op_fcts,
+                                    void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_R_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                    cham_map_data_t     *data,
+                                    cham_map_operator_t *op_fcts,
+                                    void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamW );
+
+#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_R_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                     cham_map_data_t     *data,
+                                     cham_map_operator_t *op_fcts,
+                                     void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamRW );
+
+#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_W_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                    cham_map_data_t     *data,
+                                    cham_map_operator_t *op_fcts,
+                                    void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamR );
+
+#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_W_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                    cham_map_data_t     *data,
+                                    cham_map_operator_t *op_fcts,
+                                    void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+
+#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_W_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                     cham_map_data_t     *data,
+                                     cham_map_operator_t *op_fcts,
+                                     void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamRW );
+
+#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_RW_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                     cham_map_data_t     *data,
+                                     cham_map_operator_t *op_fcts,
+                                     void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamR );
+
+#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_RW_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                     cham_map_data_t     *data,
+                                     cham_map_operator_t *op_fcts,
+                                     void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+static inline void omp_map_two_RW_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                      cham_map_data_t     *data,
+                                      cham_map_operator_t *op_fcts,
+                                      void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+
+    assert( ndata == 2 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamRW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 2, descA, tileA, descB, tileB );
+    }
+}
+
+/*
+ * @brief Open map function type with three tiles
+ */
+static inline void omp_map_three_R_R_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamR );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_R_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamW );
+
+#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_R_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamR );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( in:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_W_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_W_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_W_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( in:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_RW_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_RW_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
 {
-    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
 
-    switch( accessA ) {
-    case ChamW:
-#pragma omp task depend( out: tileA[0] )
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamW );
+
+#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_R_RW_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamR );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( in:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_R_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamR );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_R_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+
+#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_R_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamR );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( out:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_W_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_W_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                        cham_map_data_t     *data,
+                                        cham_map_operator_t *op_fcts,
+                                        void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_W_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( out:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_RW_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_RW_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamW );
+
+#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_W_RW_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( out:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] )
     {
-        op_fct( A, uplo, Am, An, tileA, op_args );
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
     }
-    break;
+}
+
+static inline void omp_map_three_RW_R_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
 
-    case ChamR:
-#pragma omp task depend( in: tileA[0] )
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamR );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( in:tileC[0] )
     {
-        op_fct( A, uplo, Am, An, tileA, op_args );
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
     }
+}
+
+static inline void omp_map_three_RW_R_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
 
-    break;
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamW );
 
-    case ChamRW:
-    default:
-#pragma omp task depend( inout: tileA[0] )
+#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( out:tileC[0] )
     {
-        op_fct( A, uplo, Am, An, tileA, op_args );
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
     }
+}
+
+static inline void omp_map_three_RW_R_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamR );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( in:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
     }
+}
+
+static inline void omp_map_three_RW_W_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
 
-    (void)options;
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_RW_W_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                         cham_map_data_t     *data,
+                                         cham_map_operator_t *op_fcts,
+                                         void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_RW_W_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( out:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_RW_RW_R( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamR );
+
+#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( in:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_RW_RW_W( cham_uplo_t uplo, int m, int n, int ndata,
+                                          cham_map_data_t     *data,
+                                          cham_map_operator_t *op_fcts,
+                                          void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( out:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static inline void omp_map_three_RW_RW_RW( cham_uplo_t uplo, int m, int n, int ndata,
+                                           cham_map_data_t     *data,
+                                           cham_map_operator_t *op_fcts,
+                                           void                *op_args )
+{
+    cham_map_data_t   *A     = data;
+    const CHAM_desc_t *descA = A->desc;
+    CHAM_tile_t       *tileA = descA->get_blktile( descA, m, n );
+    cham_map_data_t   *B     = data + 1;
+    const CHAM_desc_t *descB = B->desc;
+    CHAM_tile_t       *tileB = descB->get_blktile( descB, m, n );
+    cham_map_data_t   *C     = data + 2;
+    const CHAM_desc_t *descC = C->desc;
+    CHAM_tile_t       *tileC = descC->get_blktile( descC, m, n );
+
+    assert( ndata == 3 );
+    assert( A->access == ChamRW );
+    assert( B->access == ChamRW );
+    assert( C->access == ChamRW );
+
+#pragma omp task depend( inout:tileA[0] ) depend( inout:tileB[0] ) depend( inout:tileC[0] )
+    {
+        op_fcts->cpufunc( op_args, uplo, m, n, 3, descA, tileA, descB, tileB, descC, tileC );
+    }
+}
+
+static omp_map_fct_t omp_map_one[3] = {
+    omp_map_one_R, omp_map_one_W, omp_map_one_RW
+};
+
+static omp_map_fct_t omp_map_two[3][3] = {
+    { omp_map_two_R_R,  omp_map_two_R_W,  omp_map_two_R_RW },
+    { omp_map_two_W_R,  omp_map_two_W_W,  omp_map_two_W_RW },
+    { omp_map_two_RW_R, omp_map_two_RW_W, omp_map_two_RW_RW }
+};
+
+static omp_map_fct_t omp_map_three[3][3][3] = {
+    {   { omp_map_three_R_R_R,   omp_map_three_R_R_W,   omp_map_three_R_R_RW   },
+        { omp_map_three_R_W_R,   omp_map_three_R_W_W,   omp_map_three_R_W_RW   },
+        { omp_map_three_R_RW_R,  omp_map_three_R_RW_W,  omp_map_three_R_RW_RW  }  },
+    {   { omp_map_three_W_R_R,   omp_map_three_W_R_W,   omp_map_three_W_R_RW   },
+        { omp_map_three_W_W_R,   omp_map_three_W_W_W,   omp_map_three_W_W_RW   },
+        { omp_map_three_W_RW_R,  omp_map_three_W_RW_W,  omp_map_three_W_RW_RW  }  },
+    {   { omp_map_three_RW_R_R,  omp_map_three_RW_R_W,  omp_map_three_RW_R_RW  },
+        { omp_map_three_RW_W_R,  omp_map_three_RW_W_W,  omp_map_three_RW_W_RW  },
+        { omp_map_three_RW_RW_R, omp_map_three_RW_RW_W, omp_map_three_RW_RW_RW }  }
+};
+
+
+void INSERT_TASK_map( const RUNTIME_option_t *options,
+                      cham_uplo_t uplo, int m, int n,
+                      int ndata, cham_map_data_t *data,
+                      cham_map_operator_t *op_fcts, void *op_args )
+{
+    if ( ( ndata < 0 ) || ( ndata > 3 ) ) {
+        fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" );
+        return;
+    }
+
+    switch( ndata ) {
+    case 1:
+        omp_map_one[ data[0].access - 1 ](
+            uplo, m, n, ndata, data, op_fcts, op_args );
+        break;
+
+    case 2:
+        omp_map_two[ data[0].access - 1 ][ data[1].access - 1 ](
+            uplo, m, n, ndata, data, op_fcts, op_args );
+        break;
+
+    case 3:
+        omp_map_three[ data[0].access - 1 ][ data[1].access - 1 ][ data[2].access - 1 ](
+            uplo, m, n, ndata, data, op_fcts, op_args );
+        break;
+    }
 }
diff --git a/runtime/parsec/codelets/codelet_map.c b/runtime/parsec/codelets/codelet_map.c
index 8016f857e1b1307e0436057371bb08e76b670a6e..12a701bf379b41e5e9e42a61b4df3153c4b3cf60 100644
--- a/runtime/parsec/codelets/codelet_map.c
+++ b/runtime/parsec/codelets/codelet_map.c
@@ -9,50 +9,120 @@
  *
  * @brief Chameleon map PaRSEC codelet
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2022-02-22
+ * @date 2024-03-11
  *
  */
 #include "chameleon_parsec.h"
 #include "chameleon/tasks.h"
 
+struct parsec_map_args_s {
+    cham_uplo_t          uplo;
+    int                  m, n;
+    cham_map_operator_t *op_fcts;
+    void                *op_args;
+    const CHAM_desc_t   *desc[1];
+};
+
+static inline int
+CORE_map_one_parsec( parsec_execution_stream_t *context,
+                     parsec_task_t             *this_task )
+{
+    struct parsec_map_args_s *pargs = NULL;
+    CHAM_tile_t              *tileA;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &tileA );
+    pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 1,
+                             pargs->desc[0], tileA );
+
+    free( pargs );
+}
+
+static inline int
+CORE_map_two_parsec( parsec_execution_stream_t *context,
+                     parsec_task_t             *this_task )
+{
+    struct parsec_map_args_s *pargs = NULL;
+    CHAM_tile_t              *tileA;
+    CHAM_tile_t              *tileB;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB );
+    pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 2,
+                             pargs->desc[0], tileA, pargs->desc[1], tileB );
+
+    free( pargs );
+}
+
 static inline int
-CORE_map_parsec( parsec_execution_stream_t *context,
-                 parsec_task_t             *this_task )
+CORE_map_three_parsec( parsec_execution_stream_t *context,
+                       parsec_task_t             *this_task )
 {
-    const CHAM_desc_t *desc;
-    cham_uplo_t uplo;
-    int m;
-    int n;
-    void *data;
-    cham_unary_operator_t op_fct;
-    void *op_args;
-
-    parsec_dtd_unpack_args(
-        this_task, &desc, &uplo, &m, &n, &data, &op_fct, &op_args );
-    op_fct( desc, uplo, m, n, data, op_args );
-
-    (void)context;
-    return PARSEC_HOOK_RETURN_DONE;
+    struct parsec_map_args_s *pargs = NULL;
+    CHAM_tile_t              *tileA;
+    CHAM_tile_t              *tileB;
+    CHAM_tile_t              *tileC;
+
+    parsec_dtd_unpack_args( this_task, &pargs, &tileA, &tileB, &tileC );
+    pargs->op_fcts->cpufunc( pargs->op_args, pargs->uplo, pargs->m, pargs->n, 3,
+                             pargs->desc[0], tileA, pargs->desc[1], tileB,
+                             pargs->desc[2], tileC );
+
+    free( pargs );
 }
 
 void INSERT_TASK_map( const RUNTIME_option_t *options,
-                      cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
-                      cham_unary_operator_t op_fct, void *op_args )
+                      cham_uplo_t uplo, int m, int n,
+                      int ndata, cham_map_data_t *data,
+                      cham_map_operator_t *op_fcts, void *op_args )
 {
-    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
-
-    int parsec_accessA = cham_to_parsec_access( accessA );
-
-    parsec_dtd_taskpool_insert_task(
-        PARSEC_dtd_taskpool, CORE_map_parsec, options->priority, "map",
-        sizeof(CHAM_desc_t*),             &A,    VALUE,
-        sizeof(cham_uplo_t),              &uplo, VALUE,
-        sizeof(int),                      &Am,   VALUE,
-        sizeof(int),                      &An,   VALUE,
-        PASSED_BY_REF, RTBLKADDR(A, void, Am, An), chameleon_parsec_get_arena_index( A ) | parsec_accessA,
-        sizeof(cham_unary_operator_t),    &op_fct,  VALUE,
-        sizeof(void*),                    &op_args, VALUE,
-        PARSEC_DTD_ARG_END );
+    parsec_taskpool_t        *PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    struct parsec_map_args_s *pargs      = NULL;
+    size_t                    pargs_size = 0;
+    int                       i;
+
+    if ( ( ndata < 0 ) || ( ndata > 3 ) ) {
+        fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" );
+        return;
+    }
+
+    pargs_size = sizeof( struct parsec_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * );
+    pargs = malloc( pargs_size );
+    pargs->uplo    = uplo;
+    pargs->m       = m;
+    pargs->n       = n;
+    pargs->op_fcts = op_fcts;
+    pargs->op_args = op_args;
+    for( i=0; i<ndata; i++ ) {
+        pargs->desc[i] = data[i].desc;
+    }
+
+    switch( ndata ) {
+    case 1:
+        parsec_dtd_taskpool_insert_task(
+            PARSEC_dtd_taskpool, CORE_map_one_parsec, options->priority, op_fcts->name,
+            sizeof(struct parsec_map_args_s*), &pargs, VALUE,
+            PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ),
+            PARSEC_DTD_ARG_END );
+        break;
+
+    case 2:
+        parsec_dtd_taskpool_insert_task(
+            PARSEC_dtd_taskpool, CORE_map_two_parsec, options->priority, op_fcts->name,
+            sizeof(struct parsec_map_args_s*), &pargs, VALUE,
+            PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ),
+            PASSED_BY_REF, RTBLKADDR( data[1].desc, void, m, n ), chameleon_parsec_get_arena_index( data[1].desc ) | cham_to_parsec_access( data[1].access ),
+            PARSEC_DTD_ARG_END );
+        break;
+
+    case 3:
+        parsec_dtd_taskpool_insert_task(
+            PARSEC_dtd_taskpool, CORE_map_three_parsec, options->priority, op_fcts->name,
+            sizeof(struct parsec_map_args_s*), &pargs, VALUE,
+            PASSED_BY_REF, RTBLKADDR( data[0].desc, void, m, n ), chameleon_parsec_get_arena_index( data[0].desc ) | cham_to_parsec_access( data[0].access ),
+            PASSED_BY_REF, RTBLKADDR( data[1].desc, void, m, n ), chameleon_parsec_get_arena_index( data[1].desc ) | cham_to_parsec_access( data[1].access ),
+            PASSED_BY_REF, RTBLKADDR( data[2].desc, void, m, n ), chameleon_parsec_get_arena_index( data[2].desc ) | cham_to_parsec_access( data[2].access ),
+            PARSEC_DTD_ARG_END );
+        break;
+    }
 }
diff --git a/runtime/quark/codelets/codelet_map.c b/runtime/quark/codelets/codelet_map.c
index 8b8eca82b4565883d2012a9232e7236feef6b803..b7930ff7134fe38b504b9d14e522147fa52ea6cd 100644
--- a/runtime/quark/codelets/codelet_map.c
+++ b/runtime/quark/codelets/codelet_map.c
@@ -9,42 +9,114 @@
  *
  * @brief Chameleon map Quark codelet
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
- * @date 2022-02-22
+ * @date 2024-03-11
  *
  */
 #include "chameleon_quark.h"
 #include "chameleon/tasks.h"
 
-void CORE_map_quark(Quark *quark)
+struct quark_map_args_s {
+    cham_uplo_t          uplo;
+    int                  m, n;
+    cham_map_operator_t *op_fcts;
+    void                *op_args;
+    const CHAM_desc_t   *desc[1];
+};
+
+void CORE_map_one_quark(Quark *quark)
+{
+    struct quark_map_args_s *qargs = NULL;
+    CHAM_tile_t             *tileA;
+
+    quark_unpack_args_2( quark, qargs, tileA );
+    qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 1,
+                             qargs->desc[0], tileA );
+
+    free( qargs );
+}
+
+void CORE_map_two_quark(Quark *quark)
+{
+    struct quark_map_args_s *qargs = NULL;
+    CHAM_tile_t             *tileA;
+    CHAM_tile_t             *tileB;
+
+    quark_unpack_args_3( quark, qargs, tileA, tileB );
+    qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 2,
+                             qargs->desc[0], tileA, qargs->desc[1], tileB );
+
+    free( qargs );
+}
+
+void CORE_map_three_quark(Quark *quark)
 {
-    const CHAM_desc_t *desc;
-    cham_uplo_t uplo;
-    int m;
-    int n;
-    CHAM_tile_t *tile;
-    cham_unary_operator_t op_fct;
-    void *op_args;
-
-    quark_unpack_args_7( quark, desc, uplo, m, n, tile, op_fct, op_args );
-    op_fct( desc, uplo, m, n, tile, op_args );
+    struct quark_map_args_s *qargs = NULL;
+    CHAM_tile_t             *tileA;
+    CHAM_tile_t             *tileB;
+    CHAM_tile_t             *tileC;
+
+    quark_unpack_args_4( quark, qargs, tileA, tileB, tileC );
+    qargs->op_fcts->cpufunc( qargs->op_args, qargs->uplo, qargs->m, qargs->n, 3,
+                             qargs->desc[0], tileA, qargs->desc[1], tileB,
+                             qargs->desc[2], tileC );
+
+    free( qargs );
 }
 
 void INSERT_TASK_map( const RUNTIME_option_t *options,
-                      cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
-                      cham_unary_operator_t op_fct, void *op_args )
+                      cham_uplo_t uplo, int m, int n,
+                      int ndata, cham_map_data_t *data,
+                      cham_map_operator_t *op_fcts, void *op_args )
 {
-    quark_option_t *opt = (quark_option_t*)(options->schedopt);
-
-    QUARK_Insert_Task(
-        opt->quark, CORE_map_quark, (Quark_Task_Flags*)opt,
-        sizeof(CHAM_desc_t*),             &A,    VALUE,
-        sizeof(cham_uplo_t),              &uplo, VALUE,
-        sizeof(int),                      &Am,   VALUE,
-        sizeof(int),                      &An,   VALUE,
-        sizeof(void*), RTBLKADDR(A, void, Am, An), cham_to_quark_access( accessA ),
-        sizeof(cham_unary_operator_t),    &op_fct,  VALUE,
-        sizeof(void*),                    &op_args, VALUE,
-        0);
+    struct quark_map_args_s *qargs = NULL;
+    quark_option_t          *opt   = (quark_option_t*)(options->schedopt);
+    size_t                   qargs_size = 0;
+    int                      i;
+
+    if ( ( ndata < 0 ) || ( ndata > 3 ) ) {
+        fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" );
+        return;
+    }
+
+    qargs_size = sizeof( struct quark_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * );
+    qargs = malloc( qargs_size );
+    qargs->uplo    = uplo;
+    qargs->m       = m;
+    qargs->n       = n;
+    qargs->op_fcts = op_fcts;
+    qargs->op_args = op_args;
+    for( i=0; i<ndata; i++ ) {
+        qargs->desc[i] = data[i].desc;
+    }
+
+    switch( ndata ) {
+    case 1:
+        QUARK_Insert_Task(
+            opt->quark, CORE_map_one_quark, (Quark_Task_Flags*)opt,
+            sizeof(void*), &qargs, VALUE,
+            sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ),
+            0 );
+        break;
+
+    case 2:
+        QUARK_Insert_Task(
+            opt->quark, CORE_map_two_quark, (Quark_Task_Flags*)opt,
+            sizeof(void*), &qargs, VALUE,
+            sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ),
+            sizeof(void*), RTBLKADDR( data[1].desc, void, m, n), cham_to_quark_access( data[1].access ),
+            0 );
+        break;
+
+    case 3:
+        QUARK_Insert_Task(
+            opt->quark, CORE_map_three_quark, (Quark_Task_Flags*)opt,
+            sizeof(void*), &qargs, VALUE,
+            sizeof(void*), RTBLKADDR( data[0].desc, void, m, n), cham_to_quark_access( data[0].access ),
+            sizeof(void*), RTBLKADDR( data[1].desc, void, m, n), cham_to_quark_access( data[1].access ),
+            sizeof(void*), RTBLKADDR( data[2].desc, void, m, n), cham_to_quark_access( data[2].access ),
+            0 );
+        break;
+    }
 }
diff --git a/runtime/starpu/codelets/codelet_map.c b/runtime/starpu/codelets/codelet_map.c
index 6bdaed3eb2bf37abd32c63eeafbf1d2b438dac5e..2ef297b0d71e0152a5e997be70bf0880e2260046 100644
--- a/runtime/starpu/codelets/codelet_map.c
+++ b/runtime/starpu/codelets/codelet_map.c
@@ -12,66 +12,332 @@
  * @version 1.3.0
  * @author Mathieu Faverge
  * @author Florent Pruvost
- * @date 2023-07-06
+ * @date 2024-03-11
  *
  */
 #include "chameleon_starpu.h"
 #include "runtime_codelet_z.h"
 
-CHAMELEON_CL_CB(map, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N)
+struct cl_map_args_s {
+    cham_uplo_t          uplo;
+    int                  m, n;
+    cham_map_operator_t *op_fcts;
+    void                *op_args;
+    const CHAM_desc_t   *desc[1];
+};
 
+/*
+ * Map with a single tile as parameter
+ */
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_map_one_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    CHAM_tile_t          *tileA;
+
+    tileA = cti_interface_get( descr[0] );
+    clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1,
+                              clargs->desc[0], tileA );
+}
+
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_map_one_cuda_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    cublasHandle_t        handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t          *tileA;
+
+    tileA = cti_interface_get( descr[0] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1,
+                               clargs->desc[0], tileA );
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_map_one_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    hipblasHandle_t       handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t          *tileA;
+
+    tileA = cti_interface_get( descr[0] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 1,
+                              clargs->desc[0], tileA );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CHAMELEON_CL_CB( map_one, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N )
+#if defined(CHAMELEON_USE_HIP)
+    CODELETS_GPU( map_one, cl_map_one_cpu_func, cl_map_one_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( map_one, cl_map_one_cpu_func, cl_map_one_cuda_func, STARPU_CUDA_ASYNC )
+#endif
+
+/*
+ * Map with two tiles as parameter
+ */
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_map_two_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2,
+                              clargs->desc[0], tileA, clargs->desc[1], tileB );
+}
+
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_map_two_cuda_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    cublasHandle_t        handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2,
+                               clargs->desc[0], tileA, clargs->desc[1], tileB );
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_map_two_hip_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    hipblasHandle_t       handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 2,
+                              clargs->desc[0], tileA, clargs->desc[1], tileB );
+}
+#endif /* defined(CHAMELEON_USE_HIP) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CHAMELEON_CL_CB( map_two, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N )
+#if defined(CHAMELEON_USE_HIP)
+    CODELETS_GPU( map_two, cl_map_two_cpu_func, cl_map_two_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( map_two, cl_map_two_cpu_func, cl_map_two_cuda_func, STARPU_CUDA_ASYNC )
+#endif
+
+/*
+ * Map with three tiles as parameter
+ */
 #if !defined(CHAMELEON_SIMULATION)
-static void cl_map_cpu_func(void *descr[], void *cl_arg)
+static void cl_map_three_cpu_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+    CHAM_tile_t          *tileC;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    tileC = cti_interface_get( descr[2] );
+    clargs->op_fcts->cpufunc( clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3,
+                              clargs->desc[0], tileA, clargs->desc[1], tileB,
+                              clargs->desc[2], tileC );
+}
+
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_map_three_cuda_func( void *descr[], void *cl_arg )
+{
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    cublasHandle_t        handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+    CHAM_tile_t          *tileC;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    tileC = cti_interface_get( descr[2] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->cudafunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3,
+                               clargs->desc[0], tileA, clargs->desc[1], tileB,
+                               clargs->desc[2], tileC );
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+
+#if defined(CHAMELEON_USE_HIP)
+static void
+cl_map_three_hip_func( void *descr[], void *cl_arg )
 {
-    const CHAM_desc_t *desc;
-    cham_uplo_t uplo;
-    int m;
-    int n;
-    CHAM_tile_t *tile;
-    cham_unary_operator_t op_fct;
-    void *op_args;
-
-    tile = cti_interface_get(descr[0]);
-    starpu_codelet_unpack_args(cl_arg, &desc, &uplo, &m, &n, &op_fct, &op_args );
-    op_fct( desc, uplo, m, n, tile, op_args );
+    struct cl_map_args_s *clargs = (struct cl_map_args_s*)cl_arg;
+    hipblasHandle_t       handle = starpu_hipblas_get_local_handle();
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
+    CHAM_tile_t          *tileC;
+
+    tileA = cti_interface_get( descr[0] );
+    tileB = cti_interface_get( descr[1] );
+    tileC = cti_interface_get( descr[2] );
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileC->format & CHAMELEON_TILE_FULLRANK );
+    clargs->op_fcts->hipfunc( handle, clargs->op_args, clargs->uplo, clargs->m, clargs->n, 3,
+                              clargs->desc[0], tileA, clargs->desc[1], tileB,
+                              clargs->desc[2], tileC );
 }
+#endif /* defined(CHAMELEON_USE_HIP) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS_CPU(map, cl_map_cpu_func)
+CHAMELEON_CL_CB( map_three, cti_handle_get_m( task->handles[0] ), cti_handle_get_n( task->handles[0] ), 0, M * N )
+#if defined(CHAMELEON_USE_HIP)
+    CODELETS_GPU( map_three, cl_map_three_cpu_func, cl_map_three_hip_func, STARPU_HIP_ASYNC )
+#else
+CODELETS( map_three, cl_map_three_cpu_func, cl_map_three_cuda_func, STARPU_CUDA_ASYNC )
+#endif
 
 void INSERT_TASK_map( const RUNTIME_option_t *options,
-                      cham_access_t accessA, cham_uplo_t uplo, const CHAM_desc_t *A, int Am, int An,
-                      cham_unary_operator_t op_fct, void *op_args )
+                      cham_uplo_t uplo, int m, int n,
+                      int ndata, cham_map_data_t *data,
+                      cham_map_operator_t *op_fcts, void *op_args )
 {
+    struct cl_map_args_s *clargs  = NULL;
+    const char           *cl_name = (op_fcts->name == NULL) ? "map" : op_fcts->name;
+    int                   exec    = 0;
+    int                   i, readonly = 1;
+    size_t                clargs_size = 0;
+    void (*callback)(void*);
 
-    struct starpu_codelet *codelet = &cl_map;
-    void (*callback)(void*) = options->profiling ? cl_map_callback : NULL;
-    char                  *cl_name = "map";
+    if ( ( ndata < 0 ) || ( ndata > 3 ) ) {
+        fprintf( stderr, "INSERT_TASK_map() can handle only 1 to 3 parameters\n" );
+        return;
+    }
 
     CHAMELEON_BEGIN_ACCESS_DECLARATION;
-    CHAMELEON_ACCESS_RW(A, Am, An);
+    for( i=0; i<ndata; i++ ) {
+        if ( data[i].access == ChamRW ) {
+            CHAMELEON_ACCESS_RW( data[i].desc, m, n );
+            readonly = 0;
+        }
+        else if ( data[i].access == ChamW ) {
+            CHAMELEON_ACCESS_W( data[i].desc, m, n );
+            readonly = 0;
+        }
+        else {
+            CHAMELEON_ACCESS_R( data[i].desc, m, n );
+        }
+    }
+    exec = __chameleon_need_exec;
+    /* Force execution for read-only functions */
+    if ( readonly && __chameleon_need_submit ) {
+        exec = 1;
+    }
     CHAMELEON_END_ACCESS_DECLARATION;
 
-    cl_name = chameleon_codelet_name( cl_name, 1,
-                                      A->get_blktile( A, Am, An ) );
-
-    rt_starpu_insert_task(
-        codelet,
-        STARPU_VALUE,    &A,                      sizeof(CHAM_desc_t*),
-        STARPU_VALUE,    &uplo,                   sizeof(cham_uplo_t),
-        STARPU_VALUE,    &Am,                     sizeof(int),
-        STARPU_VALUE,    &An,                     sizeof(int),
-        cham_to_starpu_access(accessA), RTBLKADDR(A, ChamByte, Am, An),
-        STARPU_VALUE,    &op_fct,                 sizeof(cham_unary_operator_t),
-        STARPU_VALUE,    &op_args,                sizeof(void*),
-        STARPU_PRIORITY,  options->priority,
-        STARPU_CALLBACK,  callback,
-        STARPU_EXECUTE_ON_WORKER, options->workerid,
+    if ( exec ) {
+        clargs_size = sizeof( struct cl_map_args_s ) + (ndata - 1) * sizeof( CHAM_desc_t * );
+        clargs = malloc( clargs_size );
+        clargs->uplo    = uplo;
+        clargs->m       = m;
+        clargs->n       = n;
+        clargs->op_fcts = op_fcts;
+        clargs->op_args = op_args;
+        for( i=0; i<ndata; i++ ) {
+            clargs->desc[i] = data[i].desc;
+        }
+    }
+
+    /* Refine name */
+    for( i=0; i<ndata; i++ ) {
+        cl_name = chameleon_codelet_name( cl_name, 1,
+                                          (data[i].desc)->get_blktile( data[i].desc, m, n ) );
+    }
+
+    /* Insert the task */
+    switch( ndata ) {
+    case 1:
+        callback = options->profiling ? cl_map_one_callback : NULL;
+        rt_starpu_insert_task(
+            &cl_map_one,
+            /* Task codelet arguments */
+            STARPU_CL_ARGS, clargs, clargs_size,
+
+            /* Task handles */
+            cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ),
+
+            /* Common task arguments */
+            STARPU_PRIORITY,          options->priority,
+            STARPU_CALLBACK,          callback,
+            STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+            STARPU_NAME,              cl_name,
+#endif
+            0 );
+        break;
+
+    case 2:
+        callback = options->profiling ? cl_map_two_callback : NULL;
+        rt_starpu_insert_task(
+            &cl_map_two,
+            /* Task codelet arguments */
+            STARPU_CL_ARGS, clargs, clargs_size,
+
+            /* Task handles */
+            cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ),
+            cham_to_starpu_access( data[1].access ), RTBLKADDR( data[1].desc, ChamByte, m, n ),
+
+            /* Common task arguments */
+            STARPU_PRIORITY,          options->priority,
+            STARPU_CALLBACK,          callback,
+            STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+            STARPU_NAME,              cl_name,
+#endif
+            0 );
+        break;
+
+    case 3:
+        callback = options->profiling ? cl_map_three_callback : NULL;
+        rt_starpu_insert_task(
+            &cl_map_three,
+            /* Task codelet arguments */
+            STARPU_CL_ARGS, clargs, clargs_size,
+
+            /* Task handles */
+            cham_to_starpu_access( data[0].access ), RTBLKADDR( data[0].desc, ChamByte, m, n ),
+            cham_to_starpu_access( data[1].access ), RTBLKADDR( data[1].desc, ChamByte, m, n ),
+            cham_to_starpu_access( data[2].access ), RTBLKADDR( data[2].desc, ChamByte, m, n ),
+
+            /* Common task arguments */
+            STARPU_PRIORITY,          options->priority,
+            STARPU_CALLBACK,          callback,
+            STARPU_EXECUTE_ON_WORKER, options->workerid,
 #if defined(CHAMELEON_CODELETS_HAVE_NAME)
-        STARPU_NAME, cl_name,
+            STARPU_NAME,              cl_name,
 #endif
-        0);
+            0 );
+        break;
+    }
 }