diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py
index fd7bf168984884808310088730f67eed8aad9a32..b90df480a365381097b434ad2d09ad414131b9af 100644
--- a/cmake_modules/local_subs.py
+++ b/cmake_modules/local_subs.py
@@ -144,3 +144,5 @@ subs = {
         ('hmat_p',               'hmat_s',               'hmat_d',               'hmat_c',               'hmat_z'              ),
     ]
 }
+
+exceptfrom = []
diff --git a/gpucublas/CMakeLists.txt b/gpucublas/CMakeLists.txt
index aae427010700699dfa138cc0654fb59fb8da200b..87584aa8ab04bc3e8790006fe4c0e621f8a35037 100644
--- a/gpucublas/CMakeLists.txt
+++ b/gpucublas/CMakeLists.txt
@@ -26,6 +26,48 @@
 #
 ###
 
+# Add CUDA kernel if compiler and toolkit are available
+# -----------------------------------------------------
+include(CheckLanguage)
+check_language(CUDA)
+
+if(CMAKE_CUDA_COMPILER)
+  enable_language(CUDA)
+  find_package(CUDAToolkit)
+else()
+  message(STATUS "CUDA language is not supported")
+endif()
+
+if (CUDAToolkit_FOUND)
+  set(GPUCUBLAS_HAVE_CUDA_TOOLKIT ON CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE)
+
+  include(SetCMakeCudaArchitectures)
+
+  if (CUDAToolkit_VERSION VERSION_GREATER_EQUAL "7.5")
+    set(GPUCUBLAS_HAVE_CUDA_HALF ON CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
+  else()
+    set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
+  endif()
+else()
+  set(GPUCUBLAS_HAVE_CUDA_TOOLKIT OFF CACHE INTERNAL "Indicate if cuda kernels are enabled or not" FORCE)
+  set(GPUCUBLAS_HAVE_CUDA_HALF OFF CACHE INTERNAL "Indicate if half precision support is enabled or not" FORCE)
+endif()
+
+if ( GPUCUBLAS_HAVE_CUDA_HALF )
+  ##morse_cmake_required_set( CUBLAS )
+  set(CMAKE_REQUIRED_LIBRARIES CUDA::CUBLAS)
+
+  check_function_exists(cublasHgemm GPUCUBLAS_HAVE_CUBLASHGEMM)
+  if ( GPUCUBLAS_HAVE_CUBLASHGEMM )
+    message("-- ${Blue}Add definition HAVE_CUBLASHGEMM${ColourReset}")
+  endif()
+  check_function_exists(cublasGemmEx GPUCUBLAS_HAVE_CUBLASGEMMEX)
+  if ( GPUCUBLAS_HAVE_CUBLASGEMMEX )
+    message("-- ${Blue}Add definition HAVE_CUBLASGEMMEX${ColourReset}")
+  endif()
+  morse_cmake_required_unset()
+endif()
+
 add_subdirectory(include)
 add_subdirectory(compute)
 add_subdirectory(eztrace_module)
diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt
index 80828b2e23300b0c825d387a90ac334e8a6c5777..556e18827383495bb5d688092dd985fc6eb6f840 100644
--- a/gpucublas/compute/CMakeLists.txt
+++ b/gpucublas/compute/CMakeLists.txt
@@ -56,24 +56,15 @@ set(ZSRC
     cuda_zunmqrt.c
     )
 
-# Add CUDA kernel if compiler and toolkit are available
-# -----------------------------------------------------
-include(CheckLanguage)
-check_language(CUDA)
-
-if(CMAKE_CUDA_COMPILER)
-  enable_language(CUDA)
-  find_package(CUDAToolkit)
-else()
-  message(STATUS "CUDA language is not supported")
-endif()
-
-if (CUDAToolkit_FOUND)
-  include(SetCMakeCudaArchitectures)
-
+if ( GPUCUBLAS_HAVE_CUDA_TOOLKIT )
   set(ZSRC
     ${ZSRC}
     cuda_zlag2c.cu
+  )
+endif()
+if ( GPUCUBLAS_HAVE_CUDA_HALF )
+  set(ZSRC
+    ${ZSRC}
     cuda_dlag2h.cu
   )
 endif()
@@ -102,13 +93,24 @@ precisions_rules_py(
 
 set(GPUCUBLAS_SRCS
   ${GPUCUBLAS_SRCS_GENERATED}
-  cuda_hgemm.c
-  cuda_gemmex.c
   cudaglobal.c
+)
+
+if (GPUCUBLAS_HAVE_CUBLASHGEMM)
+  set(GPUCUBLAS_SRCS
+    ${GPUCUBLAS_SRCS}
+    cuda_hgemm.c
   )
+  # Need to use CXX compiler to have the __half support and access to cublasHgemm()
+  set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
+endif()
 
-# Need to use CXX compiler to have the __half support and access to cublasHgemm()
-set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX )
+if (GPUCUBLAS_HAVE_CUBLASGEMMEX)
+  set(GPUCUBLAS_SRCS
+    ${GPUCUBLAS_SRCS}
+    cuda_gemmex.c
+  )
+endif()
 
 # Force generation of sources
 # ---------------------------
diff --git a/gpucublas/compute/cuda_gemmex.c b/gpucublas/compute/cuda_gemmex.c
index 9384e39811ecdf503f3f73bcb506cbe2a3dcace9..b7dccdbf28da03eeb57173a792c0141680c627a3 100644
--- a/gpucublas/compute/cuda_gemmex.c
+++ b/gpucublas/compute/cuda_gemmex.c
@@ -16,6 +16,10 @@
  */
 #include "gpucublas.h"
 
+#if !defined(GPUCUBLAS_HAVE_CUBLASGEMMEX)
+#error "This file should not be compiled"
+#endif
+
 int
 CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
              int m, int n, int k,
diff --git a/gpucublas/compute/cuda_hgemm.c b/gpucublas/compute/cuda_hgemm.c
index a4898d24f1eb497eac0f6461c8b0654b3ffbbefb..85088d04d4510df0c54493aa72411650cd2883c6 100644
--- a/gpucublas/compute/cuda_hgemm.c
+++ b/gpucublas/compute/cuda_hgemm.c
@@ -16,6 +16,10 @@
  */
 #include "gpucublas.h"
 
+#if !defined(GPUCUBLAS_HAVE_CUBLASHGEMM)
+#error "This file should not be compiled"
+#endif
+
 extern "C" int
 CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
             int m, int n, int k,
diff --git a/gpucublas/include/CMakeLists.txt b/gpucublas/include/CMakeLists.txt
index 35b0dd0b6faaf99d45fdfb1f556cd744666974e9..a2cb62b5cf9cb41573c561bc5c359f2d3f023a54 100644
--- a/gpucublas/include/CMakeLists.txt
+++ b/gpucublas/include/CMakeLists.txt
@@ -38,9 +38,13 @@ precisions_rules_py(
 
 # Define the list of headers
 # --------------------------
+configure_file("gpucublas.h.in"
+               "gpucublas.h"
+               @ONLY)
+
 set(GPUCUBLAS_HDRS
-    gpucublas.h
-    )
+  ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h
+)
 
 # Add generated headers
 # ---------------------
@@ -55,7 +59,7 @@ set(CHAMELEON_SOURCES_TARGETS "${CHAMELEON_SOURCES_TARGETS};gpucublas_include" C
 
 # Installation
 # ------------
-install( FILES gpucublas.h
+install( FILES ${CMAKE_CURRENT_BINARY_DIR}/gpucublas.h
          DESTINATION include )
 
 install( FILES ${GPUCUBLAS_HDRS}
diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h.in
similarity index 93%
rename from gpucublas/include/gpucublas.h
rename to gpucublas/include/gpucublas.h.in
index b9a0ecf3ea56255367f13524bccd423368cd5475..451a494f21e77bc5cb3f77dccec21520f5d43082 100644
--- a/gpucublas/include/gpucublas.h
+++ b/gpucublas/include/gpucublas.h.in
@@ -52,6 +52,11 @@
 #include "chameleon/struct.h"
 #include "chameleon/constants.h"
 
+#cmakedefine GPUCUBLAS_HAVE_CUDA_TOOLKIT
+#cmakedefine GPUCUBLAS_HAVE_CUDA_HALF
+#cmakedefine GPUCUBLAS_HAVE_CUBLASHGEMM
+#cmakedefine GPUCUBLAS_HAVE_CUBLASGEMMEX
+
 /**
  * CUDA BLAS headers
  */
@@ -64,6 +69,7 @@ BEGIN_C_DECLS
 #include "gpucublas/gpucublas_zc.h"
 #include "gpucublas/gpucublas_ds.h"
 
+#if defined(GPUCUBLAS_HAVE_CUBLASHGEMM)
 int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
                 int m, int n, int k,
                 const CHAMELEON_Real16_t *alpha,
@@ -72,7 +78,9 @@ int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb,
                 const CHAMELEON_Real16_t *beta,
                 CHAMELEON_Real16_t *C, int ldc,
                 cublasHandle_t handle );
+#endif
 
+#if defined(GPUCUBLAS_HAVE_CUBLASGEMMEX)
 int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
                  int m, int n, int k,
                  const void *alpha,
@@ -81,6 +89,7 @@ int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb,
                  const void *beta,
                  void *C, int ldc, cham_flttype_t Ctype,
                  cublasHandle_t handle );
+#endif
 
 static inline cublasComputeType_t
 chameleon_cublas_ctype( cham_flttype_t flttype ) {
diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h
index fa04fc878084a891c90d9a04a871ba91f38b0fb3..6a44ee9f967192d714c471775a4857b5015fc058 100644
--- a/gpucublas/include/gpucublas/gpucublas_z.h
+++ b/gpucublas/include/gpucublas/gpucublas_z.h
@@ -24,8 +24,10 @@
 /**
  *  Declarations of cuda kernels - alphabetical order
  */
+#if defined(GPUCUBLAS_HAVE_CUDA_HALF)
 int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle );
 int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle );
+#endif
 int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
 int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle );
 int CUDA_zgemm(  cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle );
diff --git a/gpucublas/include/gpucublas/gpucublas_zc.h b/gpucublas/include/gpucublas/gpucublas_zc.h
index e0140cfa3a7700ce22756749c11a577beeadf0bc..ee29dcefecdd388ee412cc61acc69b2b6381f8de 100644
--- a/gpucublas/include/gpucublas/gpucublas_zc.h
+++ b/gpucublas/include/gpucublas/gpucublas_zc.h
@@ -18,7 +18,9 @@
 #ifndef _gpucublas_zc_h_
 #define _gpucublas_zc_h_
 
+#if defined(GPUCUBLAS_HAVE_CUDA_TOOLKIT)
 int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle );
 int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle );
+#endif
 
 #endif /* _gpucublas_zc_h_ */
diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt
index 633013b28911d36172b6acc64cb1492f8fb6068e..7f2eab94cd34acadf580ed42f5c858562d927ae4 100644
--- a/runtime/starpu/CMakeLists.txt
+++ b/runtime/starpu/CMakeLists.txt
@@ -252,9 +252,15 @@ set(RUNTIME_SRCS_GENERATED "")
 set(ZSRC
   codelets/codelet_zcallback.c
   codelets/codelet_zccallback.c
-  codelets/codelet_dlag2h.c
   ${CODELETS_ZSRC}
+)
+
+if(GPUCUBLAS_HAVE_CUDA_HALF OR CHAMELEON_SIMULATION)
+  set(ZSRC
+    ${ZSRC}
+    codelets/codelet_dlag2h.c
   )
+endif()
 
 precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
   PRECISIONS "${CHAMELEON_PRECISION}"
@@ -262,11 +268,23 @@ precisions_rules_py(RUNTIME_SRCS_GENERATED "${ZSRC}"
 
 set(CODELETS_SRC
   codelets/codelet_convert.c
-  codelets/codelet_hgemm.c
   codelets/codelet_gemm.c
   ${CODELETS_SRC}
   )
 
+if(GPUCUBLAS_HAVE_CUBLASHGEMM OR CHAMELEON_SIMULATION)
+  set(CODELETS_SRC
+    codelets/codelet_hgemm.c
+    ${CODELETS_SRC}
+  )
+endif()
+if(GPUCUBLAS_HAVE_CUBLASGEMMEX OR CHAMELEON_SIMULATION)
+  set(CODELETS_SRC
+    codelets/codelet_gemmex.c
+    ${CODELETS_SRC}
+  )
+endif()
+
 set(RUNTIME_SRCS
   ${RUNTIME_COMMON}
   ${RUNTIME_SRCS_GENERATED}
diff --git a/runtime/starpu/codelets/codelet_convert.c b/runtime/starpu/codelets/codelet_convert.c
index 8a637a493c00b0169b1aedae56c043070c93fcc4..8a8b34fccc628069e41d23ed3eb2ad11fa6942b7 100644
--- a/runtime/starpu/codelets/codelet_convert.c
+++ b/runtime/starpu/codelets/codelet_convert.c
@@ -75,7 +75,7 @@ insert_task_convert( const RUNTIME_option_t *options,
         break;
 #endif
 
-#if defined(CHAMELEON_PREC_D) && defined(CHAMELON_USE_CUDA)
+#if defined(CHAMELEON_PREC_D) && defined(GPUCUBLAS_HAVE_CUDA_HALF)
     case ChamConvertRealDoubleToHalf:
         codelet = &cl_dlag2h;
         callback = cl_dlag2h_callback;
@@ -87,7 +87,7 @@ insert_task_convert( const RUNTIME_option_t *options,
         break;
 #endif
 
-#if defined(CHAMELEON_PREC_S) && defined(CHAMELON_USE_CUDA)
+#if defined(CHAMELEON_PREC_S) && defined(GPUCUBLAS_HAVE_CUDA_HALF)
     case ChamConvertRealSingleToHalf:
         codelet = &cl_slag2h;
         callback = cl_slag2h_callback;