From ae7bbb634ced89e9fdb6ea6c360dc2e88e6201b7 Mon Sep 17 00:00:00 2001
From: Mathieu Faverge <mathieu.faverge@inria.fr>
Date: Tue, 23 May 2023 14:41:30 -0400
Subject: [PATCH] gpus: Add the hgemm kernel for cuda and hip

---
 gpucublas/compute/CMakeLists.txt  |  4 +++
 gpucublas/compute/cuda_hgemm.c    | 42 +++++++++++++++++++++++++++++++
 gpucublas/include/gpucublas.h     | 16 ++++++++++--
 gpuhipblas/compute/CMakeLists.txt |  5 ++--
 gpuhipblas/compute/hip_hgemm.c    | 41 ++++++++++++++++++++++++++++++
 gpuhipblas/include/gpuhipblas.h   | 14 +++++++++--
 6 files changed, 116 insertions(+), 6 deletions(-)
 create mode 100644 gpucublas/compute/cuda_hgemm.c
 create mode 100644 gpuhipblas/compute/hip_hgemm.c

diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt
index d06b47928..81b998145 100644
--- a/gpucublas/compute/CMakeLists.txt
+++ b/gpucublas/compute/CMakeLists.txt
@@ -291,9 +291,13 @@ precisions_rules_py(
 
 set(GPUCUBLAS_SRCS
   ${GPUCUBLAS_SRCS_GENERATED}
+  cuda_hgemm.c
   cudaglobal.c
   )
 
+# Need to use CXX compiler to have the __half support and access to cublasHgemm()
+set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
+
 # Force generation of sources
 # ---------------------------
 add_custom_target(gpucublas_sources ALL SOURCES ${GPUCUBLAS_SRCS})
diff --git a/gpucublas/compute/cuda_hgemm.c b/gpucublas/compute/cuda_hgemm.c
new file mode 100644
index 000000000..02f465a6a
--- /dev/null
+++ b/gpucublas/compute/cuda_hgemm.c
@@ -0,0 +1,42 @@
+/**
+ *
+ * @file cuda_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpucublas.h"
+
+extern "C" int
+CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+            int m, int n, int k,
+            const CHAMELEON_Real16_t *alpha,
+            const CHAMELEON_Real16_t *A, int lda,
+            const CHAMELEON_Real16_t *B, int ldb,
+            const CHAMELEON_Real16_t *beta,
+            CHAMELEON_Real16_t *C, int ldc,
+            cublasHandle_t handle )
+{
+    cublasStatus_t rc;
+
+    rc = cublasHgemm( handle,
+                      (cublasOperation_t)chameleon_cublas_const(transa),
+                      (cublasOperation_t)chameleon_cublas_const(transb),
+                      m, n, k,
+                      CUBLAS_VALUE(alpha), A, lda,
+                                           B, ldb,
+                      CUBLAS_VALUE(beta),  C, ldc);
+
+    assert( rc == CUBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h
index 8a9deb977..29a7046a8 100644
--- a/gpucublas/include/gpucublas.h
+++ b/gpucublas/include/gpucublas.h
@@ -11,11 +11,11 @@
  *
  * @brief Chameleon GPU kernels main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
  * @author Nathalie Furmento
- * @date 2022-02-22
+ * @date 2023-07-04
  * @precisions normal z -> c d s
  *
  */
@@ -35,6 +35,9 @@
 
 #include <cuda.h>
 #include <cuComplex.h>
+#if CUDA_VERSION >= 7500
+#include <cuda_fp16.h>
+#endif
 
 #include <cublas_v2.h>
 
@@ -58,6 +61,15 @@ BEGIN_C_DECLS
 #include "gpucublas/gpucublas_c.h"
 #include "gpucublas/gpucublas_s.h"
 
+int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+                int m, int n, int k,
+                const CHAMELEON_Real16_t *alpha,
+                const CHAMELEON_Real16_t *A, int lda,
+                const CHAMELEON_Real16_t *B, int ldb,
+                const CHAMELEON_Real16_t *beta,
+                CHAMELEON_Real16_t *C, int ldc,
+                cublasHandle_t handle );
+
 END_C_DECLS
 
 /**
diff --git a/gpuhipblas/compute/CMakeLists.txt b/gpuhipblas/compute/CMakeLists.txt
index bd7ac4540..cd3db9e9e 100644
--- a/gpuhipblas/compute/CMakeLists.txt
+++ b/gpuhipblas/compute/CMakeLists.txt
@@ -17,12 +17,12 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Florent Pruvost
 #  @author Guillaume Sylvand
 #  @author Mathieu Faverge
 #  @author Loris Lucido
-#  @date 2023-01-30
+#  @date 2023-07-04
 #
 ###
 
@@ -47,6 +47,7 @@ precisions_rules_py(
 
 set(GPUHIPBLAS_SRCS
   ${GPUHIPBLAS_SRCS_GENERATED}
+  hip_hgemm.c
   hipglobal.c
   )
 
diff --git a/gpuhipblas/compute/hip_hgemm.c b/gpuhipblas/compute/hip_hgemm.c
new file mode 100644
index 000000000..12d7feac2
--- /dev/null
+++ b/gpuhipblas/compute/hip_hgemm.c
@@ -0,0 +1,41 @@
+/**
+ *
+ * @file hip_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon hip_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpuhipblas.h"
+
+int
+HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+           int m, int n, int k,
+           const CHAMELEON_Real16_t *alpha,
+           const CHAMELEON_Real16_t *A, int lda,
+           const CHAMELEON_Real16_t *B, int ldb,
+           const CHAMELEON_Real16_t *beta,
+           CHAMELEON_Real16_t *C, int ldc,
+           hipblasHandle_t handle )
+{
+    hipblasStatus_t rc;
+
+    rc = hipblasHgemm( handle,
+                       chameleon_hipblas_const(transa), chameleon_hipblas_const(transb),
+                       m, n, k,
+                       HIPBLAS_VALUE(alpha), A, lda,
+                                             B, ldb,
+                       HIPBLAS_VALUE(beta),  C, ldc );
+
+    assert( rc == HIPBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
diff --git a/gpuhipblas/include/gpuhipblas.h b/gpuhipblas/include/gpuhipblas.h
index 572abcf96..705084958 100644
--- a/gpuhipblas/include/gpuhipblas.h
+++ b/gpuhipblas/include/gpuhipblas.h
@@ -11,12 +11,12 @@
  *
  * @brief Chameleon GPU kernels main header
  *
- * @version 1.2.0
+ * @version 1.3.0
  * @author Florent Pruvost
  * @author Mathieu Faverge
  * @author Nathalie Furmento
  * @author Loris Lucido
- * @date 2023-01-30
+ * @date 2023-07-04
  * @precisions normal z -> c d s
  *
  */
@@ -36,6 +36,7 @@
 
 #include <hip/hip_runtime.h>
 #include <hip/hip_complex.h>
+#include <hip/hip_fp16.h>
 
 #include <hipblas/hipblas.h>
 
@@ -59,6 +60,15 @@ BEGIN_C_DECLS
 #include "gpuhipblas/gpuhipblas_c.h"
 #include "gpuhipblas/gpuhipblas_s.h"
 
+int HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+               int m, int n, int k,
+               const CHAMELEON_Real16_t *alpha,
+               const CHAMELEON_Real16_t *A, int lda,
+               const CHAMELEON_Real16_t *B, int ldb,
+               const CHAMELEON_Real16_t *beta,
+               CHAMELEON_Real16_t *C, int ldc,
+               hipblasHandle_t handle );
+
 END_C_DECLS
 
 /**
-- 
GitLab