diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index fd7bf168984884808310088730f67eed8aad9a32..b90df480a365381097b434ad2d09ad414131b9af 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -144,3 +144,5 @@ subs = { ('hmat_p', 'hmat_s', 'hmat_d', 'hmat_c', 'hmat_z' ), ] } + +exceptfrom = [] diff --git a/gpucublas/CMakeLists.txt b/gpucublas/CMakeLists.txt index aae427010700699dfa138cc0654fb59fb8da200b..87584aa8ab04bc3e8790006fe4c0e621f8a35037 100644 --- a/gpucublas/CMakeLists.txt +++ b/gpucublas/CMakeLists.txt @@ -26,6 +26,48 @@ # ### +# Add CUDA kernel if compiler and toolkit are available +# ----------------------------------------------------- +include(CheckLanguage) +check_language(CUDA) + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) +else() + message(STATUS "CUDA language is not supported") +endif() + +if (CUDAToolkit_FOUND) + set(GPUCUBLAS_HAVE_CUDA_TOOLKIT ON CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE) + + include(SetCMakeCudaArchitectures) + + if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "7.5") + set(GPUCUBLAS_HAVE_CUDA_HALF ON CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE) + else() + set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE) + endif() +else() + set(GPUCUBLAS_HAVE_CUDA_TOOLKIT OFF CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE) + set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE) +endif() + +if ( GPUCUBLAS_HAVE_CUDA_HALF ) + ##morse_cmake_required_set( CUBLAS ) + set(CMAKE_REQUIRED_LIBRARIES CUDA::CUBLAS) + + check_function_exists(cublasHgemm GPUCUBLAS_HAVE_CUBLASHGEMM) + if ( GPUCUBLAS_HAVE_CUBLASHGEMM ) + message("-- ${Blue}Add definition HAVE_CUBLASHGEMM${ColourReset}") + endif() + check_function_exists(cublasGemmEx GPUCUBLAS_HAVE_CUBLASGEMMEX) + if ( GPUCUBLAS_HAVE_CUBLASGEMMEX ) + message("-- ${Blue}Add definition HAVE_CUBLASGEMMEX${ColourReset}") + endif() + morse_cmake_required_unset() +endif() + add_subdirectory(include) add_subdirectory(compute) add_subdirectory(eztrace_module) diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt index 80828b2e23300b0c825d387a90ac334e8a6c5777..556e18827383495bb5d688092dd985fc6eb6f840 100644 --- a/gpucublas/compute/CMakeLists.txt +++ b/gpucublas/compute/CMakeLists.txt @@ -56,24 +56,15 @@ set(ZSRC cuda_zunmqrt.c ) -# Add CUDA kernel if compiler and toolkit are available -# ----------------------------------------------------- -include(CheckLanguage) -check_language(CUDA) - -if(CMAKE_CUDA_COMPILER) - enable_language(CUDA) - find_package(CUDAToolkit) -else() - message(STATUS "CUDA language is not supported") -endif() - -if (CUDAToolkit_FOUND) - include(SetCMakeCudaArchitectures) - +if ( GPUCUBLAS_HAVE_CUDA_TOOLKIT ) set(ZSRC ${ZSRC} cuda_zlag2c.cu + ) +endif() +if ( GPUCUBLAS_HAVE_CUDA_HALF ) + set(ZSRC + ${ZSRC} cuda_dlag2h.cu ) endif() @@ -102,13 +93,24 @@ precisions_rules_py( set(GPUCUBLAS_SRCS ${GPUCUBLAS_SRCS_GENERATED} - cuda_hgemm.c - cuda_gemmex.c cudaglobal.c +) + +if (GPUCUBLAS_HAVE_CUBLASHGEMM) + set(GPUCUBLAS_SRCS + ${GPUCUBLAS_SRCS} + cuda_hgemm.c ) + # Need to use CXX compiler to have the __half support and access to cublasHgemm() + set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX ) +endif() -# Need to use CXX compiler to have the __half support and access to cublasHgemm() -set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX ) +if (GPUCUBLAS_HAVE_CUBLASGEMMEX) + set(GPUCUBLAS_SRCS + ${GPUCUBLAS_SRCS} + cuda_gemmex.c + ) +endif() # Force generation of sources # --------------------------- diff --git a/gpucublas/compute/cuda_gemmex.c b/gpucublas/compute/cuda_gemmex.c index 9384e39811ecdf503f3f73bcb506cbe2a3dcace9..b7dccdbf28da03eeb57173a792c0141680c627a3 100644 --- a/gpucublas/compute/cuda_gemmex.c +++ b/gpucublas/compute/cuda_gemmex.c @@ -16,6 +16,10 @@ */ #include "gpucublas.h" +#if !defined(GPUCUBLAS_HAVE_CUBLASGEMMEX) +#error "This file should not be compiled" +#endif + int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, diff --git a/gpucublas/compute/cuda_hgemm.c b/gpucublas/compute/cuda_hgemm.c index a4898d24f1eb497eac0f6461c8b0654b3ffbbefb..85088d04d4510df0c54493aa72411650cd2883c6 100644 --- a/gpucublas/compute/cuda_hgemm.c +++ b/gpucublas/compute/cuda_hgemm.c @@ -16,6 +16,10 @@ */ #include "gpucublas.h" +#if !defined(GPUCUBLAS_HAVE_CUBLASHGEMM) +#error "This file should not be compiled" +#endif + extern "C" int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, diff --git a/gpucublas/include/CMakeLists.txt b/gpucublas/include/CMakeLists.txt index 35b0dd0b6faaf99d45fdfb1f556cd744666974e9..a2cb62b5cf9cb41573c561bc5c359f2d3f023a54 100644 --- a/gpucublas/include/CMakeLists.txt +++ b/gpucublas/include/CMakeLists.txt @@ -38,9 +38,13 @@ precisions_rules_py( # Define the list of headers # -------------------------- +configure_file("gpucublas.h.in" + "gpucublas.h" + @ONLY) + set(GPUCUBLAS_HDRS - gpucublas.h - ) + ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h +) # Add generated headers # --------------------- @@ -55,7 +59,7 @@ set(CHAMELEON_SOURCES_TARGETS "${CHAMELEON_SOURCES_TARGETS};gpucublas_include" C # Installation # ------------ -install( FILES gpucublas.h +install( FILES ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h DESTINATION include ) install( FILES ${GPUCUBLAS_HDRS} diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h.in similarity index 93% rename from gpucublas/include/gpucublas.h rename to gpucublas/include/gpucublas.h.in index b9a0ecf3ea56255367f13524bccd423368cd5475..451a494f21e77bc5cb3f77dccec21520f5d43082 100644 --- a/gpucublas/include/gpucublas.h +++ b/gpucublas/include/gpucublas.h.in @@ -52,6 +52,11 @@ #include "chameleon/struct.h" #include "chameleon/constants.h" +#cmakedefine GPUCUBLAS_HAVE_CUDA_TOOLKIT +#cmakedefine GPUCUBLAS_HAVE_CUDA_HALF +#cmakedefine GPUCUBLAS_HAVE_CUBLASHGEMM +#cmakedefine GPUCUBLAS_HAVE_CUBLASGEMMEX + /** * CUDA BLAS headers */ @@ -64,6 +69,7 @@ BEGIN_C_DECLS #include "gpucublas/gpucublas_zc.h" #include "gpucublas/gpucublas_ds.h" +#if defined(GPUCUBLAS_HAVE_CUBLASHGEMM) int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const CHAMELEON_Real16_t *alpha, @@ -72,7 +78,9 @@ int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, const CHAMELEON_Real16_t *beta, CHAMELEON_Real16_t *C, int ldc, cublasHandle_t handle ); +#endif +#if defined(GPUCUBLAS_HAVE_CUBLASGEMMEX) int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const void *alpha, @@ -81,6 +89,7 @@ int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, const void *beta, void *C, int ldc, cham_flttype_t Ctype, cublasHandle_t handle ); +#endif static inline cublasComputeType_t chameleon_cublas_ctype( cham_flttype_t flttype ) { diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h index fa04fc878084a891c90d9a04a871ba91f38b0fb3..6a44ee9f967192d714c471775a4857b5015fc058 100644 --- a/gpucublas/include/gpucublas/gpucublas_z.h +++ b/gpucublas/include/gpucublas/gpucublas_z.h @@ -24,8 +24,10 @@ /** * Declarations of cuda kernels - alphabetical order */ +#if defined(GPUCUBLAS_HAVE_CUDA_HALF) int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle ); int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle ); +#endif int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); diff --git a/gpucublas/include/gpucublas/gpucublas_zc.h b/gpucublas/include/gpucublas/gpucublas_zc.h index e0140cfa3a7700ce22756749c11a577beeadf0bc..ee29dcefecdd388ee412cc61acc69b2b6381f8de 100644 --- a/gpucublas/include/gpucublas/gpucublas_zc.h +++ b/gpucublas/include/gpucublas/gpucublas_zc.h @@ -18,7 +18,9 @@ #ifndef _gpucublas_zc_h_ #define _gpucublas_zc_h_ +#if defined(GPUCUBLAS_HAVE_CUDA_TOOLKIT) int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle ); +#endif #endif /* _gpucublas_zc_h_ */ diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 633013b28911d36172b6acc64cb1492f8fb6068e..7f2eab94cd34acadf580ed42f5c858562d927ae4 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -252,9 +252,15 @@ set(RUNTIME_SRCS_GENERATED "") set(ZSRC codelets/codelet_zcallback.c codelets/codelet_zccallback.c - codelets/codelet_dlag2h.c ${CODELETS_ZSRC} +) + +if(GPUCUBLAS_HAVE_CUDA_HALF OR CHAMELEON_SIMULATION) + set(ZSRC + ${ZSRC} + codelets/codelet_dlag2h.c ) +endif() precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" PRECISIONS "${CHAMELEON_PRECISION}" @@ -262,11 +268,23 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}" set(CODELETS_SRC codelets/codelet_convert.c - codelets/codelet_hgemm.c codelets/codelet_gemm.c ${CODELETS_SRC} ) +if(GPUCUBLAS_HAVE_CUBLASHGEMM OR CHAMELEON_SIMULATION) + set(CODELETS_SRC + codelets/codelet_hgemm.c + ${CODELETS_SRC} + ) +endif() +if(GPUCUBLAS_HAVE_CUBLASGEMMEX OR CHAMELEON_SIMULATION) + set(CODELETS_SRC + codelets/codelet_gemmex.c + ${CODELETS_SRC} + ) +endif() + set(RUNTIME_SRCS ${RUNTIME_COMMON} ${RUNTIME_SRCS_GENERATED} diff --git a/runtime/starpu/codelets/codelet_convert.c b/runtime/starpu/codelets/codelet_convert.c index 8a637a493c00b0169b1aedae56c043070c93fcc4..8a8b34fccc628069e41d23ed3eb2ad11fa6942b7 100644 --- a/runtime/starpu/codelets/codelet_convert.c +++ b/runtime/starpu/codelets/codelet_convert.c @@ -75,7 +75,7 @@ insert_task_convert( const RUNTIME_option_t *options, break; #endif -#if defined(CHAMELEON_PREC_D) && defined(CHAMELON_USE_CUDA) +#if defined(CHAMELEON_PREC_D) && defined(GPUCUBLAS_HAVE_CUDA_HALF) case ChamConvertRealDoubleToHalf: codelet = &cl_dlag2h; callback = cl_dlag2h_callback; @@ -87,7 +87,7 @@ insert_task_convert( const RUNTIME_option_t *options, break; #endif -#if defined(CHAMELEON_PREC_S) && defined(CHAMELON_USE_CUDA) +#if defined(CHAMELEON_PREC_S) && defined(GPUCUBLAS_HAVE_CUDA_HALF) case ChamConvertRealSingleToHalf: codelet = &cl_slag2h; callback = cl_slag2h_callback;