gpus: Add the hgemm kernel for cuda and hip

ae7bbb63 · Mathieu Faverge · 0036dfaa · ae7bbb63 · ae7bbb63 · ae7bbb63
Commit ae7bbb63 authored 2 years ago by Mathieu Faverge
--- a/gpucublas/compute/CMakeLists.txt
+++ b/gpucublas/compute/CMakeLists.txt
@@ -291,9 +291,13 @@ precisions_rules_py(
 set(GPUCUBLAS_SRCS
  ${GPUCUBLAS_SRCS_GENERATED}
+  cuda_hgemm.c
  cudaglobal.c
  )
+# Need to use CXX compiler to have the __half support and access to cublasHgemm()
+set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
 # Force generation of sources
 # ---------------------------
 add_custom_target(gpucublas_sources ALL SOURCES ${GPUCUBLAS_SRCS})

--- a/gpucublas/compute/cuda_hgemm.c
+++ b/gpucublas/compute/cuda_hgemm.c
+/**
+ *
+ * @file cuda_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon cuda_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpucublas.h"
+extern "C" int
+CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+            int m, int n, int k,
+            const CHAMELEON_Real16_t *alpha,
+            const CHAMELEON_Real16_t *A, int lda,
+            const CHAMELEON_Real16_t *B, int ldb,
+            const CHAMELEON_Real16_t *beta,
+            CHAMELEON_Real16_t *C, int ldc,
+            cublasHandle_t handle )
+{
+    cublasStatus_t rc;
+    rc = cublasHgemm( handle,
+                      (cublasOperation_t)chameleon_cublas_const(transa),
+                      (cublasOperation_t)chameleon_cublas_const(transb),
+                      m, n, k,
+                      CUBLAS_VALUE(alpha), A, lda,
+                                           B, ldb,
+                      CUBLAS_VALUE(beta),  C, ldc);
+    assert( rc == CUBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
--- a/gpucublas/include/gpucublas.h
+++ b/gpucublas/include/gpucublas.h
@@ -11,11 +11,11 @@
 *
 * @brief Chameleon GPU kernels main header
 *
- * @version 1.2.0
+ * @version 1.3.0
 * @author Florent Pruvost
 * @author Mathieu Faverge
 * @author Nathalie Furmento
- * @date 2022-02-22
+ * @date 2023-07-04
 * @precisions normal z -> c d s
 *
 */
@@ -35,6 +35,9 @@
 #include <cuda.h>
 #include <cuComplex.h>
+#if CUDA_VERSION >= 7500
+#include <cuda_fp16.h>
+#endif
 #include <cublas_v2.h>
@@ -58,6 +61,15 @@ BEGIN_C_DECLS
 #include "gpucublas/gpucublas_c.h"
 #include "gpucublas/gpucublas_s.h"
+int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
+                int m, int n, int k,
+                const CHAMELEON_Real16_t *alpha,
+                const CHAMELEON_Real16_t *A, int lda,
+                const CHAMELEON_Real16_t *B, int ldb,
+                const CHAMELEON_Real16_t *beta,
+                CHAMELEON_Real16_t *C, int ldc,
+                cublasHandle_t handle );
 END_C_DECLS
 /**

--- a/gpuhipblas/compute/CMakeLists.txt
+++ b/gpuhipblas/compute/CMakeLists.txt
@@ -17,12 +17,12 @@
 #     Univ. of California Berkeley,
 #     Univ. of Colorado Denver.
 #
-# @version 1.2.0
+# @version 1.3.0
 #  @author Florent Pruvost
 #  @author Guillaume Sylvand
 #  @author Mathieu Faverge
 #  @author Loris Lucido
-#  @date 2023-01-30
+#  @date 2023-07-04
 #
 ###
@@ -47,6 +47,7 @@ precisions_rules_py(
 set(GPUHIPBLAS_SRCS
  ${GPUHIPBLAS_SRCS_GENERATED}
+  hip_hgemm.c
  hipglobal.c
  )

--- a/gpuhipblas/compute/hip_hgemm.c
+++ b/gpuhipblas/compute/hip_hgemm.c
+/**
+ *
+ * @file hip_hgemm.c
+ *
+ * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria,
+ *                      Univ. Bordeaux. All rights reserved.
+ *
+ ***
+ *
+ * @brief Chameleon hip_hgemm GPU kernel
+ *
+ * @version 1.3.0
+ * @author Mathieu Faverge
+ * @date 2023-07-04
+ *
+ */
+#include "gpuhipblas.h"
+int
+HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+           int m, int n, int k,
+           const CHAMELEON_Real16_t *alpha,
+           const CHAMELEON_Real16_t *A, int lda,
+           const CHAMELEON_Real16_t *B, int ldb,
+           const CHAMELEON_Real16_t *beta,
+           CHAMELEON_Real16_t *C, int ldc,
+           hipblasHandle_t handle )
+{
+    hipblasStatus_t rc;
+    rc = hipblasHgemm( handle,
+                       chameleon_hipblas_const(transa), chameleon_hipblas_const(transb),
+                       m, n, k,
+                       HIPBLAS_VALUE(alpha), A, lda,
+                                             B, ldb,
+                       HIPBLAS_VALUE(beta),  C, ldc );
+    assert( rc == HIPBLAS_STATUS_SUCCESS );
+    (void)rc;
+    return CHAMELEON_SUCCESS;
+}
--- a/gpuhipblas/include/gpuhipblas.h
+++ b/gpuhipblas/include/gpuhipblas.h
@@ -11,12 +11,12 @@
 *
 * @brief Chameleon GPU kernels main header
 *
- * @version 1.2.0
+ * @version 1.3.0
 * @author Florent Pruvost
 * @author Mathieu Faverge
 * @author Nathalie Furmento
 * @author Loris Lucido
- * @date 2023-01-30
+ * @date 2023-07-04
 * @precisions normal z -> c d s
 *
 */
@@ -36,6 +36,7 @@
 #include <hip/hip_runtime.h>
 #include <hip/hip_complex.h>
+#include <hip/hip_fp16.h>
 #include <hipblas/hipblas.h>
@@ -59,6 +60,15 @@ BEGIN_C_DECLS
 #include "gpuhipblas/gpuhipblas_c.h"
 #include "gpuhipblas/gpuhipblas_s.h"
+int HIP_hgemm( cham_trans_t transa, cham_trans_t transb,
+               int m, int n, int k,
+               const CHAMELEON_Real16_t *alpha,
+               const CHAMELEON_Real16_t *A, int lda,
+               const CHAMELEON_Real16_t *B, int ldb,
+               const CHAMELEON_Real16_t *beta,
+               CHAMELEON_Real16_t *C, int ldc,
+               hipblasHandle_t handle );
 END_C_DECLS
 /**