From 1b943777f61dc336a6818950031a3990d95e0c35 Mon Sep 17 00:00:00 2001
From: Matthieu KUHN <bkuhnm@l0.spartan.bench.local>
Date: Thu, 31 Mar 2022 14:46:46 +0200
Subject: [PATCH] codelet/starpu: Add the tasks for LU factorization without
 pivoting per column

---
 include/chameleon/tasks_z.h              |  16 +-
 runtime/CMakeLists.txt                   |   3 +-
 runtime/starpu/codelets/codelet_zpanel.c | 178 +++++++++++++++++++++++
 3 files changed, 195 insertions(+), 2 deletions(-)
 create mode 100644 runtime/starpu/codelets/codelet_zpanel.c

diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h
index b37d605e7..fa6956abd 100644
--- a/include/chameleon/tasks_z.h
+++ b/include/chameleon/tasks_z.h
@@ -23,7 +23,7 @@
  * @author Florent Pruvost
  * @author Alycia Lisito
  * @author Romain Peressoni
- * @date 2022-02-22
+ * @date 2023-02-21
  * @precisions normal z -> c d s
  *
  */
@@ -476,4 +476,18 @@ void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An );
 void RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
                                   const CHAM_desc_t *A, int Am, int An );
 
+/*
+ * Tasks for LU factorization with partial pivoting
+ */
+void INSERT_TASK_zgetrf_panel_nopiv_percol_diag( const RUNTIME_option_t *options,
+                                                 int m, int n, int k,
+                                                 const CHAM_desc_t *A, int Am, int An,
+                                                 const CHAM_desc_t *U, int Um, int Un,
+                                                 int iinfo );
+
+void INSERT_TASK_zgetrf_panel_nopiv_percol_trsm( const RUNTIME_option_t *options,
+                                                 int m, int n, int k,
+                                                 const CHAM_desc_t *A, int Am, int An,
+                                                 const CHAM_desc_t *U, int Um, int Un );
+
 #endif /* _chameleon_tasks_z_h_ */
diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 918310088..125e16fd9 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -23,7 +23,7 @@
 #  @author Mathieu Faverge
 #  @author Florent Pruvost
 #  @author Philippe Virouleau
-#  @date 2022-02-22
+#  @date 2023-02-21
 #
 ###
 
@@ -65,6 +65,7 @@ set(CODELETS_ZSRC
     codelets/codelet_zgetrf.c
     codelets/codelet_zgetrf_incpiv.c
     codelets/codelet_zgetrf_nopiv.c
+    codelets/codelet_zpanel.c
     codelets/codelet_zhe2ge.c
     codelets/codelet_zherfb.c
     codelets/codelet_zhessq.c
diff --git a/runtime/starpu/codelets/codelet_zpanel.c b/runtime/starpu/codelets/codelet_zpanel.c
new file mode 100644
index 000000000..0b917134e
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_zpanel.c
@@ -0,0 +1,178 @@
+/**
+ *
+ * @file starpu/codelet_zpanel.c
+ *
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon zpanel StarPU codelets
+ *
+ * @version 1.2.0
+ * @comment Codelets to perform panel factorization with partial pivoting
+ *
+ * @author Mathieu Faverge
+ * @author Matthieu Kuhn
+ * @date 2023-02-21
+ * @precisions normal z -> c d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_z.h"
+#include <coreblas/cblas_wrapper.h>
+
+static const CHAMELEON_Complex64_t zone  = (CHAMELEON_Complex64_t) 1.0;
+static const CHAMELEON_Complex64_t mzone = (CHAMELEON_Complex64_t)-1.0;
+
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_panel_nopiv_percol_diag_cpu_func( void *descr[], void *cl_arg )
+{
+    CHAM_tile_t           *tileA, *tileU;
+    int                    m, n, k, lda, iinfo;
+    RUNTIME_sequence_t    *sequence;
+    RUNTIME_request_t     *request;
+    CHAMELEON_Complex64_t *A, *row, pivot;
+
+    tileA = cti_interface_get( descr[0] );
+    tileU = cti_interface_get( descr[1] );
+
+    starpu_codelet_unpack_args( cl_arg, &m, &n, &k, &iinfo, &sequence, &request );
+
+    A   = tileA->mat;
+    lda = tileA->ld;
+    row = tileU->mat;
+
+    /* Shift to the diagonal element */
+    A += k * (lda + 1);
+
+    /* Extract row into buffer */
+    cblas_zcopy( n-k, A, lda, row, 1 );
+
+    /* Perform update on current diagonal block directly here */
+    if ( *row == 0. ) {
+        if ( sequence->status == CHAMELEON_SUCCESS ) {
+            RUNTIME_sequence_flush( NULL, sequence, request, iinfo+k+1 );
+        }
+        return;
+    }
+
+    pivot = 1. / *row;
+    cblas_zscal( m-k-1, CBLAS_SADDR( pivot ), A + 1, 1 );
+
+    CORE_zgemm( ChamNoTrans, ChamNoTrans,
+                m-k-1, n-k-1, 1,
+                mzone, A   + 1,       lda,
+                       row + 1,       1,
+                zone,  A   + 1 + lda, lda );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zgetrf_panel_nopiv_percol_diag, cl_zgetrf_panel_nopiv_percol_diag_cpu_func );
+
+void INSERT_TASK_zgetrf_panel_nopiv_percol_diag( const RUNTIME_option_t *options,
+                                                 int m, int n, int k,
+                                                 const CHAM_desc_t *A, int Am, int An,
+                                                 const CHAM_desc_t *U, int Um, int Un,
+                                                 int iinfo )
+{
+    struct starpu_codelet *codelet = &cl_zgetrf_panel_nopiv_percol_diag;
+    // void (*callback)(void*) = options->profiling ? cl_zgetrf_panel_nopiv_percol_diag_callback : NULL;
+    void (*callback)(void*) = NULL;
+
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_RW( A, Am, An );
+    CHAMELEON_ACCESS_W( U, Um, Un );
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,             &m,                   sizeof(int),
+        STARPU_VALUE,             &n,                   sizeof(int),
+        STARPU_VALUE,             &k,                   sizeof(int),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_W,                 RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+        STARPU_VALUE,             &iinfo,               sizeof(int),
+        STARPU_VALUE,             &(options->sequence), sizeof(RUNTIME_sequence_t*),
+        STARPU_VALUE,             &(options->request),  sizeof(RUNTIME_request_t*),
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME,              "zgetrf_panel_nopiv_percol_diag",
+#endif
+        0);
+}
+
+/*
+ * Update column blocs
+ */
+#if !defined(CHAMELEON_SIMULATION)
+static void cl_zgetrf_panel_nopiv_percol_trsm_cpu_func( void *descr[], void *cl_arg )
+{
+    CHAM_tile_t           *tileA, *tileU;
+    int                    m, n, k, lda;
+    CHAMELEON_Complex64_t *A, *row, pivot;
+
+    tileA = cti_interface_get( descr[0] );
+    tileU = cti_interface_get( descr[1] );
+
+    starpu_codelet_unpack_args( cl_arg, &m, &n, &k );
+
+    A   = tileA->mat;
+    lda = tileA->ld;
+    row = tileU->mat;
+
+    /* Shift to the right column */
+    A += k * lda;
+
+    pivot = 1. / *row;
+    cblas_zscal( m, CBLAS_SADDR( pivot ), A, 1 );
+
+    /* Update trailing matrix from k+1 to n */
+    CORE_zgemm( ChamNoTrans, ChamNoTrans,
+                m, n-k-1, 1,
+                mzone, A,       lda,
+                       row + 1, 1,
+                zone,  A + lda, lda );
+}
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS_CPU( zgetrf_panel_nopiv_percol_trsm, cl_zgetrf_panel_nopiv_percol_trsm_cpu_func );
+
+void INSERT_TASK_zgetrf_panel_nopiv_percol_trsm( const RUNTIME_option_t *options,
+                                                 int m, int n, int k,
+                                                 const CHAM_desc_t *A, int Am, int An,
+                                                 const CHAM_desc_t *U, int Um, int Un )
+{
+    struct starpu_codelet *codelet = &cl_zgetrf_panel_nopiv_percol_trsm;
+    // void (*callback)(void*) = options->profiling ? cl_zgetrf_panel_nopiv_percol_trsm_callback : NULL;
+    void (*callback)(void*) = NULL;
+
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_RW(A, Am, An);
+    CHAMELEON_ACCESS_R(U, Um, Un);
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,             &m, sizeof(int),
+        STARPU_VALUE,             &n, sizeof(int),
+        STARPU_VALUE,             &k, sizeof(int),
+        STARPU_RW,                RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
+        STARPU_R,                 RTBLKADDR(U, CHAMELEON_Complex64_t, Um, Un),
+        STARPU_PRIORITY,          options->priority,
+        STARPU_CALLBACK,          callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "zgetrf_panel_nopiv_percol_trsm",
+#endif
+        0);
+}
+
-- 
GitLab