From 06432c12c7a7ccea3b15900a5bdc4b5d9831fe88 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Mon, 22 May 2023 11:18:18 -0400
Subject: [PATCH] starpu/codelets: Add dlag2h/hlag2d codelets

---
 gpucublas/include/gpucublas/gpucublas_z.h   |   6 +-
 runtime/starpu/CMakeLists.txt               |   5 +-
 runtime/starpu/codelets/codelet_dlag2h.c    | 161 ++++++++++++++++++++
 runtime/starpu/codelets/codelet_zcallback.c |   8 +-
 runtime/starpu/include/runtime_codelet_z.h  |   5 +
 5 files changed, 179 insertions(+), 6 deletions(-)
 create mode 100644 runtime/starpu/codelets/codelet_dlag2h.c

diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h
index 0773e0035..cf655f41f 100644
--- a/gpucublas/include/gpucublas/gpucublas_z.h
+++ b/gpucublas/include/gpucublas/gpucublas_z.h
@@ -11,10 +11,10 @@
  *
  * @brief Chameleon GPU CHAMELEON_Complex64_t kernels header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
- * @date 2022-02-22
+ * @date 2023-07-06
  * @precisions normal z -> c d s
  *
  */
@@ -24,6 +24,8 @@
 /**
  *  Declarations of cuda kernels - alphabetical order
  */
+int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle );
+int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle );
 int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
 int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle );
 int CUDA_zgemm(  cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle );
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index f7c00783b..9711e7934 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -17,7 +17,7 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Cedric Castagnede
 #  @author Emmanuel Agullo
 #  @author Mathieu Faverge
@@ -26,7 +26,7 @@
 #  @author Matthieu Kuhn
 #  @author Loris Lucido
 #  @author Terry Cojean
-#  @date 2023-01-30
+#  @date 2023-07-06
 #
 ###
 cmake_minimum_required(VERSION 3.1)
@@ -242,6 +242,7 @@ set(RUNTIME_SRCS_GENERATED "")
 set(ZSRC
   codelets/codelet_zcallback.c
   codelets/codelet_zccallback.c
+  codelets/codelet_dlag2h.c
   ${CODELETS_ZSRC}
   )
 
diff --git a/runtime/starpu/codelets/codelet_dlag2h.c b/runtime/starpu/codelets/codelet_dlag2h.c
new file mode 100644
index 000000000..b910559db
--- /dev/null
+++ b/runtime/starpu/codelets/codelet_dlag2h.c
@@ -0,0 +1,161 @@
+/**
+ *
+ * @file starpu/codelet_dlag2h.c
+ *
+ * @copyright 2009-2014 The University of Tennessee and The University of
+ *                      Tennessee Research Foundation. All rights reserved.
+ * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon dlag2h StarPU codelet
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @author Emmanuel Agullo
+ * @author Cedric Castagnede
+ * @author Lucas Barros de Assis
+ * @author Florent Pruvost
+ * @author Samuel Thibault
+ * @date 2023-07-06
+ * @precisions normal d -> d s
+ *
+ */
+#include "chameleon_starpu.h"
+#include "runtime_codelet_d.h"
+
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_dlag2h_cuda_func( void *descr[], void *cl_arg )
+{
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t   *tileA;
+    CHAM_tile_t   *tileB;
+    int            m, n;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+
+    starpu_codelet_unpack_args( cl_arg, &m, &n );
+
+    int rc = CUDA_dlag2h(
+        m, n,
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        handle );
+
+    if ( rc != 0 ) {
+        fprintf( stderr, "core_dlag2h failed with info(%d)\n", rc );
+    }
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS( dlag2h, NULL, cl_dlag2h_cuda_func, STARPU_CUDA_ASYNC )
+
+/**
+ *
+ * @ingroup INSERT_TASK_Complex64_t
+ *
+ */
+void INSERT_TASK_dlag2h( const RUNTIME_option_t *options,
+                         int m, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
+{
+    (void)nb;
+    struct starpu_codelet *codelet = &cl_dlag2h;
+    void (*callback)(void*) = options->profiling ? cl_dlag2h_callback : NULL;
+
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R(A, Am, An);
+    CHAMELEON_ACCESS_W(B, Bm, Bn);
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,    &m,                 sizeof(int),
+        STARPU_VALUE,    &n,                 sizeof(int),
+        STARPU_R,         RTBLKADDR(A, ChamRealDouble, Am, An),
+        STARPU_W,         RTBLKADDR(B, ChamRealHalf, Bm, Bn),
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "dlag2h",
+#endif
+        0);
+}
+
+#if !defined(CHAMELEON_SIMULATION)
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_hlag2d_cuda_func( void *descr[], void *cl_arg )
+{
+    cublasHandle_t handle = starpu_cublas_get_local_handle();
+    CHAM_tile_t   *tileA;
+    CHAM_tile_t   *tileB;
+    int            m, n;
+
+    tileA = cti_interface_get(descr[0]);
+    tileB = cti_interface_get(descr[1]);
+
+    assert( tileA->format & CHAMELEON_TILE_FULLRANK );
+    assert( tileB->format & CHAMELEON_TILE_FULLRANK );
+
+    starpu_codelet_unpack_args( cl_arg, &m, &n );
+
+    int rc = CUDA_hlag2d(
+        m, n,
+        tileA->mat, tileA->ld,
+        tileB->mat, tileB->ld,
+        handle );
+
+    if ( rc != 0 ) {
+        fprintf( stderr, "core_hlag2d failed with info(%d)\n", rc );
+    }
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
+#endif /* !defined(CHAMELEON_SIMULATION) */
+
+/*
+ * Codelet definition
+ */
+CODELETS( hlag2d, NULL, cl_hlag2d_cuda_func, STARPU_CUDA_ASYNC )
+
+void INSERT_TASK_hlag2d( const RUNTIME_option_t *options,
+                         int m, int n, int nb,
+                         const CHAM_desc_t *A, int Am, int An,
+                         const CHAM_desc_t *B, int Bm, int Bn )
+{
+    (void)nb;
+    struct starpu_codelet *codelet = &cl_hlag2d;
+    void (*callback)(void*) = options->profiling ? cl_hlag2d_callback : NULL;
+
+    CHAMELEON_BEGIN_ACCESS_DECLARATION;
+    CHAMELEON_ACCESS_R( A, Am, An );
+    CHAMELEON_ACCESS_W( B, Bm, Bn );
+    CHAMELEON_END_ACCESS_DECLARATION;
+
+    rt_starpu_insert_task(
+        codelet,
+        STARPU_VALUE,    &m,                 sizeof(int),
+        STARPU_VALUE,    &n,                 sizeof(int),
+        STARPU_R,         RTBLKADDR(A, ChamComplexFloat, Am, An),
+        STARPU_W,         RTBLKADDR(B, ChamComplexDouble, Bm, Bn),
+        STARPU_PRIORITY,  options->priority,
+        STARPU_CALLBACK,  callback,
+        STARPU_EXECUTE_ON_WORKER, options->workerid,
+#if defined(CHAMELEON_CODELETS_HAVE_NAME)
+        STARPU_NAME, "hlag2d",
+#endif
+        0);
+}
diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c
index 8f05509f7..7d1bf87f6 100644
--- a/runtime/starpu/codelets/codelet_zcallback.c
+++ b/runtime/starpu/codelets/codelet_zcallback.c
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon zcallback StarPU codelet
  *
- * @version 1.2.0
+ * @version 1.3.0
  *  @author Mathieu Faverge
  *  @author Cedric Augonnet
  *  @author Florent Pruvost
  *  @author Alycia Lisito
- *  @date 2022-02-22
+ *  @date 2023-07-06
  *  @precisions normal z -> c d s
  *
  */
@@ -84,3 +84,7 @@ CHAMELEON_CL_CB(ztsmqr_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_ge
 CHAMELEON_CL_CB(ztstrf,        cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]),         M* M*M)
 CHAMELEON_CL_CB(zunmlq,        cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]),     2. *M* M*M)
 CHAMELEON_CL_CB(zunmqr,        cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]),     2. *M* M*M)
+#if defined(PRECISION_d) || defined(PRECISION_s)
+CHAMELEON_CL_CB(dlag2h,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
+CHAMELEON_CL_CB(hlag2d,        cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0,                                      M*N)
+#endif
diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h
index 1147a2b15..03f2dee93 100644
--- a/runtime/starpu/include/runtime_codelet_z.h
+++ b/runtime/starpu/include/runtime_codelet_z.h
@@ -134,4 +134,9 @@ CODELETS_HEADER(zsytrf_nopiv);
 #endif
 CODELETS_HEADER(zplgsy);
 
+#if defined(PRECISION_d) || defined(PRECISION_s)
+CODELETS_HEADER(dlag2h);
+CODELETS_HEADER(hlag2d);
+#endif
+
 #endif /* _runtime_codelet_z_h_ */
-- 
GitLab