From de10a29dbe5ebfeaefd5bbd3a13290b1943525c7 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Wed, 30 Aug 2023 14:01:27 +0200
Subject: [PATCH] codelets: Add zlaswp_{get,set} codelets

---
 include/chameleon/tasks_z.h               |  12 ++-
 runtime/CMakeLists.txt                    |   3 +-
 runtime/openmp/codelets/codelet_zlaswp.c  |  62 +++++++++++++
 runtime/parsec/codelets/codelet_zlaswp.c  |  92 ++++++++++++++++++
 runtime/parsec/include/chameleon_parsec.h |  24 ++++-
 runtime/quark/codelets/codelet_zlaswp.c   |  78 ++++++++++++++++
 runtime/starpu/codelets/codelet_zlaswp.c  | 108 ++++++++++++++++++++++
 7 files changed, 374 insertions(+), 5 deletions(-)
 create mode 100644 runtime/openmp/codelets/codelet_zlaswp.c
 create mode 100644 runtime/parsec/codelets/codelet_zlaswp.c
 create mode 100644 runtime/quark/codelets/codelet_zlaswp.c
 create mode 100644 runtime/starpu/codelets/codelet_zlaswp.c

diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index c5704884e..c5fdfdf4e 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -24,7 +24,7 @@
  * @author Alycia Lisito
  * @author Romain Peressoni
  * @author Matthieu Kuhn
- * @date 2023-08-22
+ * @date 2023-08-31
  * @precisions normal z -> c d s
  *
  */
@@ -186,6 +186,16 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options,
 void INSERT_TASK_zlaset2( const RUNTIME_option_t *options,
                           cham_uplo_t uplo, int n1, int n2, CHAMELEON_Complex64_t alpha,
                           const CHAM_desc_t *tileA, int tileAm, int tileAn );
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,                                      /* forwarded unchanged to the zlaswp_get kernel */
+                             const CHAM_ipiv_t *tIPIV, int tIPIVk,               /* pivot structure and index given to RUNTIME_perm_getaddr() */
+                             const CHAM_desc_t *tileA, int tileAm, int tileAn,   /* tile read by the task */
+                             const CHAM_desc_t *tileB, int tileBm, int tileBn ); /* tile updated by the task */
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,                                      /* forwarded unchanged to the zlaswp_set kernel */
+                             const CHAM_ipiv_t *tIPIV, int tIPIVk,               /* pivot structure and index given to RUNTIME_invp_getaddr() */
+                             const CHAM_desc_t *tileA, int tileAm, int tileAn,   /* tile read by the task */
+                             const CHAM_desc_t *tileB, int tileBm, int tileBn ); /* tile updated by the task */
 void INSERT_TASK_zlatro( const RUNTIME_option_t *options,
                          cham_uplo_t uplo, cham_trans_t trans, int m, int n, int mb,
                          const CHAM_desc_t *A, int Am, int An,
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 527034336..f011e6d96 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -76,8 +76,9 @@ set(CODELETS_ZSRC
     codelets/codelet_zlanhe.c
     codelets/codelet_zlansy.c
     codelets/codelet_zlantr.c
-    codelets/codelet_zlaset2.c
     codelets/codelet_zlaset.c
+    codelets/codelet_zlaset2.c
+    codelets/codelet_zlaswp.c
     codelets/codelet_zlatro.c
     codelets/codelet_zlauum.c
     codelets/codelet_zplghe.c
diff --git a/runtime/openmp/codelets/codelet_zlaswp.c b/runtime/openmp/codelets/codelet_zlaswp.c
new file mode 100644
index 000000000..452b73926
--- /dev/null
+++ b/runtime/openmp/codelets/codelet_zlaswp.c
@@ -0,0 +1,62 @@
+/**
+ *
+ * @file openmp/codelet_zlaswp.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon OpenMP codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_openmp.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    /* Submits an OpenMP task running TCORE_zlaswp_get: tile A(Am,An) is read, tile U(Um,Un) is updated */
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileU = U->get_blktile( U, Um, Un );
+    int         *perm  = NULL; /* FIXME: perm is not extracted from (ipiv, ipivk) yet, the kernel receives NULL */
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileU->format & CHAMELEON_TILE_FULLRANK );
+    /* The kernel gets the sizes of the input tile, not of the whole descriptor, as in the other runtimes */
+#pragma omp task firstprivate( m0, k, ipiv, tileA, tileU ) depend( in:perm ) depend( in:tileA[0] ) depend( inout:tileU[0] )
+    {
+        TCORE_zlaswp_get( m0, tileA->m, tileA->n, k, tileA, tileU, perm );
+    }
+    (void)options;
+}
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    /* Submits an OpenMP task running TCORE_zlaswp_set: tile A(Am,An) is read, tile B(Bm,Bn) is updated */
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+    int         *invp  = NULL; /* FIXME: invp is not extracted from (ipiv, ipivk) yet, the kernel receives NULL */
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+    /* The kernel gets the sizes of the updated tile B, as in the StarPU and PaRSEC codelets */
+#pragma omp task firstprivate( m0, k, ipiv, tileA, tileB ) depend( in:invp ) depend( in:tileA[0] ) depend( inout:tileB[0] )
+    {
+        TCORE_zlaswp_set( m0, tileB->m, tileB->n, k, tileA, tileB, invp );
+    }
+    (void)options;
+}
diff --git a/runtime/parsec/codelets/codelet_zlaswp.c b/runtime/parsec/codelets/codelet_zlaswp.c
new file mode 100644
index 000000000..284c450aa
--- /dev/null
+++ b/runtime/parsec/codelets/codelet_zlaswp.c
@@ -0,0 +1,92 @@
+/**
+ *
+ * @file parsec/codelet_zlaswp.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon PaRSEC codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_parsec.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_z.h"
+
+static inline int
+CORE_zlaswp_get_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task )
+{
+    int          m0, m, n, k, lda, ldb, *perm;
+    CHAMELEON_Complex64_t *A, *B;
+    /* PaRSEC task body: unpack in insertion order (every slot by address, lda and ldb included), then run the kernel */
+    parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &perm );
+    CORE_zlaswp_get( m0, m, n, k, A, lda, B, ldb, perm );
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE; /* the body is declared int: report completion to the runtime */
+}
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileU = U->get_blktile( U, Um, Un );
+    /* Insert the "laswp_get" DTD task; the argument order must match parsec_dtd_unpack_args() in CORE_zlaswp_get_parsec() */
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zlaswp_get_parsec, options->priority, "laswp_get",
+        sizeof(int),          &m0,         VALUE,
+        sizeof(int),          &(tileA->m), VALUE, /* m, taken from the input tile A */
+        sizeof(int),          &(tileA->n), VALUE, /* n, taken from the input tile A */
+        sizeof(int),          &k,          VALUE,
+        PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &(tileA->ld), VALUE, /* lda */
+        PASSED_BY_REF, RTBLKADDR(U, ChamComplexDouble, Um, Un), chameleon_parsec_get_arena_index( U ) | INOUT,
+        sizeof(int),         &(tileU->ld), VALUE, /* ldb */
+        PASSED_BY_REF, RUNTIME_perm_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_perm( ipiv ) | INPUT, /* NOTE: this arena-index helper is an assert(0) placeholder */
+        PARSEC_DTD_ARG_END );
+}
+
+static inline int
+CORE_zlaswp_set_parsec( parsec_execution_stream_t *context, parsec_task_t *this_task )
+{
+    int          m0, m, n, k, lda, ldb, *invp;
+    CHAMELEON_Complex64_t *A, *B;
+    /* PaRSEC task body: unpack in insertion order (every slot by address, lda and ldb included), then run the kernel */
+    parsec_dtd_unpack_args( this_task, &m0, &m, &n, &k, &A, &lda, &B, &ldb, &invp );
+    CORE_zlaswp_set( m0, m, n, k, A, lda, B, ldb, invp );
+    (void)context;
+    return PARSEC_HOOK_RETURN_DONE; /* the body is declared int: report completion to the runtime */
+}
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt);
+    CHAM_tile_t *tileA = A->get_blktile( A, Am, An );
+    CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn );
+    /* Insert the "laswp_set" DTD task; the argument order must match parsec_dtd_unpack_args() in CORE_zlaswp_set_parsec() */
+    parsec_dtd_taskpool_insert_task(
+        PARSEC_dtd_taskpool, CORE_zlaswp_set_parsec, options->priority, "laswp_set",
+        sizeof(int),          &m0,         VALUE,
+        sizeof(int),          &(tileB->m), VALUE, /* m, taken from the updated tile B */
+        sizeof(int),          &(tileB->n), VALUE, /* n, taken from the updated tile B */
+        sizeof(int),          &k,          VALUE,
+        PASSED_BY_REF, RTBLKADDR(A, ChamComplexDouble, Am, An), chameleon_parsec_get_arena_index( A ) | INPUT,
+        sizeof(int),         &(tileA->ld), VALUE, /* lda */
+        PASSED_BY_REF, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), chameleon_parsec_get_arena_index( B ) | INOUT,
+        sizeof(int),         &(tileB->ld), VALUE, /* ldb */
+        PASSED_BY_REF, RUNTIME_invp_getaddr( ipiv, ipivk ),     chameleon_parsec_get_arena_index_invp( ipiv ) | INPUT, /* NOTE: this arena-index helper is an assert(0) placeholder */
+        PARSEC_DTD_ARG_END );
+}
diff --git a/runtime/parsec/include/chameleon_parsec.h b/runtime/parsec/include/chameleon_parsec.h
index 30518fb80..23d19fd3f 100644
--- a/runtime/parsec/include/chameleon_parsec.h
+++ b/runtime/parsec/include/chameleon_parsec.h
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon PaRSEC runtime header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Mathieu Faverge
  * @author Reazul Hoque
  * @author Florent Pruvost
  * @author Samuel Thibault
- * @date 2022-02-22
+ * @date 2023-08-31
  *
  */
 #ifndef _chameleon_parsec_h_
@@ -38,10 +38,28 @@ struct chameleon_parsec_desc_s {
 typedef struct chameleon_parsec_desc_s chameleon_parsec_desc_t;
 
 static inline int
-chameleon_parsec_get_arena_index(const CHAM_desc_t *desc) {
+chameleon_parsec_get_arena_index( const CHAM_desc_t *desc ) {
     return ((chameleon_parsec_desc_t *)desc->schedopt)->arena_index;
 }
 
+static inline int
+chameleon_parsec_get_arena_index_ipiv( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* placeholder: always fails the assertion, the ipiv argument is not read */
+    return -1;  /* only reached when assertions are compiled out */
+}
+
+static inline int
+chameleon_parsec_get_arena_index_perm( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* placeholder: always fails the assertion, the ipiv argument is not read */
+    return -1;  /* only reached when assertions are compiled out */
+}
+
+static inline int
+chameleon_parsec_get_arena_index_invp( const CHAM_ipiv_t *ipiv ) {
+    assert(0);  /* placeholder: always fails the assertion, the ipiv argument is not read */
+    return -1;  /* only reached when assertions are compiled out */
+}
+
 static inline int cham_to_parsec_access( cham_access_t accessA ) {
     if ( accessA == ChamR ) {
         return INPUT;
diff --git a/runtime/quark/codelets/codelet_zlaswp.c b/runtime/quark/codelets/codelet_zlaswp.c
new file mode 100644
index 000000000..117d67618
--- /dev/null
+++ b/runtime/quark/codelets/codelet_zlaswp.c
@@ -0,0 +1,78 @@
+/**
+ *
+ * @file quark/codelet_zlaswp.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon Quark codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_quark.h"
+#include "chameleon/tasks_z.h"
+#include "coreblas/coreblas_ztile.h"
+
+static void CORE_zlaswp_get_quark( Quark *quark )
+{
+    /* Quark task body for zlaswp_get: values are unpacked in the order given to QUARK_Insert_Task() */
+    CHAM_tile_t *tileA, *tileU;
+    int         *perm;
+    int          m0, k;
+    quark_unpack_args_5( quark, m0, k, perm, tileA, tileU );
+    TCORE_zlaswp_get( m0, tileA->m, tileA->n, k, tileA, tileU, perm );
+}
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_LASWP;
+    /* Queue CORE_zlaswp_get_quark; the five arguments below are unpacked by the task body in this exact order */
+    QUARK_Insert_Task(
+        opt->quark, CORE_zlaswp_get_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),          &m0, VALUE,
+        sizeof(int),          &k,  VALUE,
+        sizeof(int*),         RUNTIME_perm_getaddr( ipiv, ipivk ),     INPUT, /* perm, read */
+        sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, /* tile A, read */
+        sizeof(CHAM_tile_t*), RTBLKADDR(U, ChamComplexDouble, Um, Un), INOUT, /* tile U, updated */
+        0 ); /* ends the argument list */
+}
+
+static void CORE_zlaswp_set_quark( Quark *quark )
+{
+    /* Quark task body for zlaswp_set: values are unpacked in the order given to QUARK_Insert_Task() */
+    int          m0, k, *invp;
+    CHAM_tile_t *A, *B;
+    quark_unpack_args_5( quark, m0, k, invp, A, B );
+    /* Sizes are those of B, the tile being updated, as in the StarPU and PaRSEC codelets */
+    TCORE_zlaswp_set( m0, B->m, B->n, k, A, B, invp );
+}
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    quark_option_t *opt = (quark_option_t*)(options->schedopt);
+    DAG_CORE_LASWP;
+    /* Queue CORE_zlaswp_set_quark; the five arguments below are unpacked by the task body in this exact order */
+    QUARK_Insert_Task(
+        opt->quark, CORE_zlaswp_set_quark, (Quark_Task_Flags*)opt,
+        sizeof(int),          &m0, VALUE,
+        sizeof(int),          &k,  VALUE,
+        sizeof(int*),         RUNTIME_invp_getaddr( ipiv, ipivk ),     INPUT, /* invp, read */
+        sizeof(CHAM_tile_t*), RTBLKADDR(A, ChamComplexDouble, Am, An), INPUT, /* tile A, read */
+        sizeof(CHAM_tile_t*), RTBLKADDR(B, ChamComplexDouble, Bm, Bn), INOUT, /* tile B, updated */
+        0 ); /* ends the argument list */
+}
diff --git a/runtime/starpu/codelets/codelet_zlaswp.c b/runtime/starpu/codelets/codelet_zlaswp.c
new file mode 100644
index 000000000..2d8fc31d4
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zlaswp.c
@@ -0,0 +1,108 @@
+/**
+ *
+ * @file starpu/codelet_zlaswp.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon StarPU codelets to apply zlaswp on a panel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-08-31
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zlaswp_get_cpu_func( void *descr[], void *cl_arg )
+{
+    /* CPU body of the zlaswp_get codelet; buffers follow the handle order used at task insertion */
+    int         *perm  = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
+    CHAM_tile_t *tileA = (CHAM_tile_t *) cti_interface_get( descr[1] );
+    CHAM_tile_t *tileU = (CHAM_tile_t *) cti_interface_get( descr[2] );
+    int          m0, k;
+
+    /* The two scalars travel packed in cl_arg */
+    starpu_codelet_unpack_args( cl_arg, &m0, &k );
+
+    TCORE_zlaswp_get( m0, tileA->m, tileA->n, k, tileA, tileU, perm );
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zlaswp_get, cl_zlaswp_get_cpu_func )
+
+void INSERT_TASK_zlaswp_get( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *U, int Um, int Un )
+{
+    struct starpu_codelet *codelet = &cl_zlaswp_get;
+    /* Submits the zlaswp_get codelet; the profiling callback below is left disabled */
+    //void (*callback)(void*) = options->profiling ? cl_zlaswp_get_callback : NULL;
+    /* Handles are listed in the order the CPU function reads descr[]: perm, A, then U */
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,               &m0, sizeof(int),
+        STARPU_VALUE,               &k,  sizeof(int),
+        STARPU_R,                   RUNTIME_perm_getaddr( ipiv, ipivk ),
+        STARPU_R,                   RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_RW | STARPU_COMMUTE, RTBLKADDR(U, ChamComplexDouble, Um, Un), /* STARPU_COMMUTE: updates of this tile may be reordered by StarPU */
+        STARPU_PRIORITY,            options->priority,
+        //STARPU_CALLBACK,            callback,
+        STARPU_EXECUTE_ON_WORKER,   options->workerid,
+        0 );
+}
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zlaswp_set_cpu_func( void *descr[], void *cl_arg )
+{
+    /* CPU body of the zlaswp_set codelet; buffers follow the handle order used at task insertion */
+    int         *invp  = (int *)STARPU_VECTOR_GET_PTR( descr[0] );
+    CHAM_tile_t *tileA = (CHAM_tile_t *) cti_interface_get( descr[1] );
+    CHAM_tile_t *tileB = (CHAM_tile_t *) cti_interface_get( descr[2] );
+    int          m0, k;
+
+    /* The two scalars travel packed in cl_arg */
+    starpu_codelet_unpack_args( cl_arg, &m0, &k );
+
+    TCORE_zlaswp_set( m0, tileB->m, tileB->n, k, tileA, tileB, invp );
+}
+#endif
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zlaswp_set, cl_zlaswp_set_cpu_func )
+
+void INSERT_TASK_zlaswp_set( const RUNTIME_option_t *options,
+                             int m0, int k,
+                             const CHAM_ipiv_t *ipiv, int ipivk,
+                             const CHAM_desc_t *A, int Am, int An,
+                             const CHAM_desc_t *B, int Bm, int Bn )
+{
+    struct starpu_codelet *codelet = &cl_zlaswp_set;
+    /* Submits the zlaswp_set codelet; the profiling callback below is left disabled */
+    //void (*callback)(void*) = options->profiling ? cl_zlaswp_set_callback : NULL;
+    /* Handles are listed in the order the CPU function reads descr[]: invp, A, then B */
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,             &m0, sizeof(int),
+        STARPU_VALUE,             &k,  sizeof(int),
+        STARPU_R,                 RUNTIME_invp_getaddr( ipiv, ipivk ),
+        STARPU_R,                 RTBLKADDR(A, ChamComplexDouble, Am, An),
+        STARPU_RW,                RTBLKADDR(B, ChamComplexDouble, Bm, Bn), /* plain read-write access, no STARPU_COMMUTE here */
+        STARPU_PRIORITY,          options->priority,
+        //STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+        0 );
+}
-- 
GitLab