Mentions légales du service

Skip to content
Snippets Groups Projects
Commit f4b27b54 authored by Mathieu Faverge's avatar Mathieu Faverge
Browse files

Merge branch 'issue127' into 'master'

Issue #127: Fix the protection of half precision kernels with cuda

Closes #127

See merge request !470
parents 806a2160 7cf96c83
No related branches found
No related tags found
1 merge request!470Issue #127: Fix the protection of half precision kernels with cuda
...@@ -144,3 +144,5 @@ subs = { ...@@ -144,3 +144,5 @@ subs = {
('hmat_p', 'hmat_s', 'hmat_d', 'hmat_c', 'hmat_z' ), ('hmat_p', 'hmat_s', 'hmat_d', 'hmat_c', 'hmat_z' ),
] ]
} }
exceptfrom = []
...@@ -26,6 +26,48 @@ ...@@ -26,6 +26,48 @@
# #
### ###
# Add CUDA kernel if compiler and toolkit are available
# -----------------------------------------------------
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
else()
message(STATUS "CUDA language is not supported")
endif()
if (CUDAToolkit_FOUND)
set(GPUCUBLAS_HAVE_CUDA_TOOLKIT ON CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE)
include(SetCMakeCudaArchitectures)
if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "7.5")
set(GPUCUBLAS_HAVE_CUDA_HALF ON CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
else()
set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
endif()
else()
set(GPUCUBLAS_HAVE_CUDA_TOOLKIT OFF CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE)
set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
endif()
if ( GPUCUBLAS_HAVE_CUDA_HALF )
##morse_cmake_required_set( CUBLAS )
set(CMAKE_REQUIRED_LIBRARIES CUDA::CUBLAS)
check_function_exists(cublasHgemm GPUCUBLAS_HAVE_CUBLASHGEMM)
if ( GPUCUBLAS_HAVE_CUBLASHGEMM )
message("-- ${Blue}Add definition HAVE_CUBLASHGEMM${ColourReset}")
endif()
check_function_exists(cublasGemmEx GPUCUBLAS_HAVE_CUBLASGEMMEX)
if ( GPUCUBLAS_HAVE_CUBLASGEMMEX )
message("-- ${Blue}Add definition HAVE_CUBLASGEMMEX${ColourReset}")
endif()
morse_cmake_required_unset()
endif()
add_subdirectory(include) add_subdirectory(include)
add_subdirectory(compute) add_subdirectory(compute)
add_subdirectory(eztrace_module) add_subdirectory(eztrace_module)
......
...@@ -56,24 +56,15 @@ set(ZSRC ...@@ -56,24 +56,15 @@ set(ZSRC
cuda_zunmqrt.c cuda_zunmqrt.c
) )
# Add CUDA kernel if compiler and toolkit are available if ( GPUCUBLAS_HAVE_CUDA_TOOLKIT )
# -----------------------------------------------------
include(CheckLanguage)
check_language(CUDA)
if(CMAKE_CUDA_COMPILER)
enable_language(CUDA)
find_package(CUDAToolkit)
else()
message(STATUS "CUDA language is not supported")
endif()
if (CUDAToolkit_FOUND)
include(SetCMakeCudaArchitectures)
set(ZSRC set(ZSRC
${ZSRC} ${ZSRC}
cuda_zlag2c.cu cuda_zlag2c.cu
)
endif()
if ( GPUCUBLAS_HAVE_CUDA_HALF )
set(ZSRC
${ZSRC}
cuda_dlag2h.cu cuda_dlag2h.cu
) )
endif() endif()
...@@ -102,13 +93,24 @@ precisions_rules_py( ...@@ -102,13 +93,24 @@ precisions_rules_py(
set(GPUCUBLAS_SRCS set(GPUCUBLAS_SRCS
${GPUCUBLAS_SRCS_GENERATED} ${GPUCUBLAS_SRCS_GENERATED}
cuda_hgemm.c
cuda_gemmex.c
cudaglobal.c cudaglobal.c
)
if (GPUCUBLAS_HAVE_CUBLASHGEMM)
set(GPUCUBLAS_SRCS
${GPUCUBLAS_SRCS}
cuda_hgemm.c
) )
# Need to use CXX compiler to have the __half support and access to cublasHgemm()
set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
endif()
# Need to use CXX compiler to have the __half support and access to cublasHgemm() if (GPUCUBLAS_HAVE_CUBLASGEMMEX)
set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX ) set(GPUCUBLAS_SRCS
${GPUCUBLAS_SRCS}
cuda_gemmex.c
)
endif()
# Force generation of sources # Force generation of sources
# --------------------------- # ---------------------------
......
...@@ -16,6 +16,10 @@ ...@@ -16,6 +16,10 @@
*/ */
#include "gpucublas.h" #include "gpucublas.h"
#if !defined(GPUCUBLAS_HAVE_CUBLASGEMMEX)
#error "This file should not be compiled"
#endif
int int
CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
int m, int n, int k, int m, int n, int k,
......
...@@ -16,6 +16,10 @@ ...@@ -16,6 +16,10 @@
*/ */
#include "gpucublas.h" #include "gpucublas.h"
#if !defined(GPUCUBLAS_HAVE_CUBLASHGEMM)
#error "This file should not be compiled"
#endif
extern "C" int extern "C" int
CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
int m, int n, int k, int m, int n, int k,
......
...@@ -38,9 +38,13 @@ precisions_rules_py( ...@@ -38,9 +38,13 @@ precisions_rules_py(
# Define the list of headers # Define the list of headers
# -------------------------- # --------------------------
configure_file("gpucublas.h.in"
"gpucublas.h"
@ONLY)
set(GPUCUBLAS_HDRS set(GPUCUBLAS_HDRS
gpucublas.h ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h
) )
# Add generated headers # Add generated headers
# --------------------- # ---------------------
...@@ -55,7 +59,7 @@ set(CHAMELEON_SOURCES_TARGETS "${CHAMELEON_SOURCES_TARGETS};gpucublas_include" C ...@@ -55,7 +59,7 @@ set(CHAMELEON_SOURCES_TARGETS "${CHAMELEON_SOURCES_TARGETS};gpucublas_include" C
# Installation # Installation
# ------------ # ------------
install( FILES gpucublas.h install( FILES ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h
DESTINATION include ) DESTINATION include )
install( FILES ${GPUCUBLAS_HDRS} install( FILES ${GPUCUBLAS_HDRS}
......
...@@ -52,6 +52,11 @@ ...@@ -52,6 +52,11 @@
#include "chameleon/struct.h" #include "chameleon/struct.h"
#include "chameleon/constants.h" #include "chameleon/constants.h"
#cmakedefine GPUCUBLAS_HAVE_CUDA_TOOLKIT
#cmakedefine GPUCUBLAS_HAVE_CUDA_HALF
#cmakedefine GPUCUBLAS_HAVE_CUBLASHGEMM
#cmakedefine GPUCUBLAS_HAVE_CUBLASGEMMEX
/** /**
* CUDA BLAS headers * CUDA BLAS headers
*/ */
...@@ -64,6 +69,7 @@ BEGIN_C_DECLS ...@@ -64,6 +69,7 @@ BEGIN_C_DECLS
#include "gpucublas/gpucublas_zc.h" #include "gpucublas/gpucublas_zc.h"
#include "gpucublas/gpucublas_ds.h" #include "gpucublas/gpucublas_ds.h"
#if defined(GPUCUBLAS_HAVE_CUBLASHGEMM)
int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
int m, int n, int k, int m, int n, int k,
const CHAMELEON_Real16_t *alpha, const CHAMELEON_Real16_t *alpha,
...@@ -72,7 +78,9 @@ int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, ...@@ -72,7 +78,9 @@ int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
const CHAMELEON_Real16_t *beta, const CHAMELEON_Real16_t *beta,
CHAMELEON_Real16_t *C, int ldc, CHAMELEON_Real16_t *C, int ldc,
cublasHandle_t handle ); cublasHandle_t handle );
#endif
#if defined(GPUCUBLAS_HAVE_CUBLASGEMMEX)
int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
int m, int n, int k, int m, int n, int k,
const void *alpha, const void *alpha,
...@@ -81,6 +89,7 @@ int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, ...@@ -81,6 +89,7 @@ int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
const void *beta, const void *beta,
void *C, int ldc, cham_flttype_t Ctype, void *C, int ldc, cham_flttype_t Ctype,
cublasHandle_t handle ); cublasHandle_t handle );
#endif
static inline cublasComputeType_t static inline cublasComputeType_t
chameleon_cublas_ctype( cham_flttype_t flttype ) { chameleon_cublas_ctype( cham_flttype_t flttype ) {
......
...@@ -24,8 +24,10 @@ ...@@ -24,8 +24,10 @@
/** /**
* Declarations of cuda kernels - alphabetical order * Declarations of cuda kernels - alphabetical order
*/ */
#if defined(GPUCUBLAS_HAVE_CUDA_HALF)
int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle ); int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle );
int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle ); int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle );
#endif
int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle );
int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle );
......
...@@ -18,7 +18,9 @@ ...@@ -18,7 +18,9 @@
#ifndef _gpucublas_zc_h_ #ifndef _gpucublas_zc_h_
#define _gpucublas_zc_h_ #define _gpucublas_zc_h_
#if defined(GPUCUBLAS_HAVE_CUDA_TOOLKIT)
int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle );
#endif
#endif /* _gpucublas_zc_h_ */ #endif /* _gpucublas_zc_h_ */
...@@ -252,9 +252,15 @@ set(RUNTIME_SRCS_GENERATED "") ...@@ -252,9 +252,15 @@ set(RUNTIME_SRCS_GENERATED "")
set(ZSRC set(ZSRC
codelets/codelet_zcallback.c codelets/codelet_zcallback.c
codelets/codelet_zccallback.c codelets/codelet_zccallback.c
codelets/codelet_dlag2h.c
${CODELETS_ZSRC} ${CODELETS_ZSRC}
)
if(GPUCUBLAS_HAVE_CUDA_HALF OR CHAMELEON_SIMULATION)
set(ZSRC
${ZSRC}
codelets/codelet_dlag2h.c
) )
endif()
precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
PRECISIONS "${CHAMELEON_PRECISION}" PRECISIONS "${CHAMELEON_PRECISION}"
...@@ -262,11 +268,23 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" ...@@ -262,11 +268,23 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
set(CODELETS_SRC set(CODELETS_SRC
codelets/codelet_convert.c codelets/codelet_convert.c
codelets/codelet_hgemm.c
codelets/codelet_gemm.c codelets/codelet_gemm.c
${CODELETS_SRC} ${CODELETS_SRC}
) )
if(GPUCUBLAS_HAVE_CUBLASHGEMM OR CHAMELEON_SIMULATION)
set(CODELETS_SRC
codelets/codelet_hgemm.c
${CODELETS_SRC}
)
endif()
if(GPUCUBLAS_HAVE_CUBLASGEMMEX OR CHAMELEON_SIMULATION)
set(CODELETS_SRC
codelets/codelet_gemmex.c
${CODELETS_SRC}
)
endif()
set(RUNTIME_SRCS set(RUNTIME_SRCS
${RUNTIME_COMMON} ${RUNTIME_COMMON}
${RUNTIME_SRCS_GENERATED} ${RUNTIME_SRCS_GENERATED}
......
...@@ -75,7 +75,7 @@ insert_task_convert( const RUNTIME_option_t *options, ...@@ -75,7 +75,7 @@ insert_task_convert( const RUNTIME_option_t *options,
break; break;
#endif #endif
#if defined(CHAMELEON_PREC_D) && defined(CHAMELON_USE_CUDA) #if defined(CHAMELEON_PREC_D) && defined(GPUCUBLAS_HAVE_CUDA_HALF)
case ChamConvertRealDoubleToHalf: case ChamConvertRealDoubleToHalf:
codelet = &cl_dlag2h; codelet = &cl_dlag2h;
callback = cl_dlag2h_callback; callback = cl_dlag2h_callback;
...@@ -87,7 +87,7 @@ insert_task_convert( const RUNTIME_option_t *options, ...@@ -87,7 +87,7 @@ insert_task_convert( const RUNTIME_option_t *options,
break; break;
#endif #endif
#if defined(CHAMELEON_PREC_S) && defined(CHAMELON_USE_CUDA) #if defined(CHAMELEON_PREC_S) && defined(GPUCUBLAS_HAVE_CUDA_HALF)
case ChamConvertRealSingleToHalf: case ChamConvertRealSingleToHalf:
codelet = &cl_slag2h; codelet = &cl_slag2h;
callback = cl_slag2h_callback; callback = cl_slag2h_callback;
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment