diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 2b4d0a1db651921f94fe509cd5976800e69c162a..0968de66d35b777f2efbbc2d75ebfdbc9ccdb475 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -24,7 +24,9 @@ # @author Florent Pruvost # @author Samuel Thibault # @author Matthieu Kuhn -# @date 2022-02-22 +# @author Loris Lucido +# @author Terry Cojean +# @date 2023-01-30 # ### cmake_minimum_required(VERSION 3.1) @@ -147,6 +149,13 @@ if ( STARPU_FOUND ) endif() endif() + if (CHAMELEON_USE_HIP AND NOT CHAMELEON_SIMULATION) + check_function_exists(starpu_hipblas_get_local_handle HAVE_STARPU_HIPBLAS_GET_LOCAL_HANDLE) + if ( NOT HAVE_STARPU_HIPBLAS_GET_LOCAL_HANDLE ) + message(FATAL_ERROR "The detected StarPU library does not include hipblas support. Please disable CHAMELEON_USE_HIP_{CUDA,ROC} or provide a StarPU library with hipblas support.") + endif() + endif() + morse_cmake_required_unset() endif ( STARPU_FOUND ) @@ -268,6 +277,9 @@ if (NOT CHAMELEON_SIMULATION) if (CHAMELEON_USE_CUDA) add_dependencies(chameleon_starpu cudablas_include) endif() + if (CHAMELEON_USE_HIP) + add_dependencies(chameleon_starpu hipblas_include) + endif() endif() target_include_directories(chameleon_starpu PUBLIC @@ -294,6 +306,9 @@ if (NOT CHAMELEON_SIMULATION) if(CHAMELEON_USE_CUDA) target_link_libraries(chameleon_starpu PRIVATE cudablas) endif(CHAMELEON_USE_CUDA) + if(CHAMELEON_USE_HIP) + target_link_libraries(chameleon_starpu PRIVATE hipblas) + endif(CHAMELEON_USE_HIP) endif(NOT CHAMELEON_SIMULATION) target_link_libraries(chameleon_starpu PRIVATE MORSE::M) @@ -307,6 +322,9 @@ if (NOT CHAMELEON_SIMULATION) if (CHAMELEON_USE_CUDA) target_link_libraries(chameleon_starpu PUBLIC CUDA::CUBLAS) endif() + if (CHAMELEON_USE_HIP) + target_link_libraries(chameleon_starpu PUBLIC HIP::HIPBLAS) + endif() endif() # export target diff --git a/runtime/starpu/control/runtime_context.c b/runtime/starpu/control/runtime_context.c index c83ab1283339f214c91c61f7643d75874e67658e..e2cdb399f401872d4712cb7894d350155994c098 100644 --- a/runtime/starpu/control/runtime_context.c +++ b/runtime/starpu/control/runtime_context.c @@ -16,6 +16,7 @@ * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost + * @author Terry Cojean * @date 2022-02-22 * */ diff --git a/runtime/starpu/control/runtime_control.c b/runtime/starpu/control/runtime_control.c index 5a36b8a6f0a2643f148b09b90b900b9a323caa27..d527cd56b307ad2dc033bf7ecc878a26cddff523 100644 --- a/runtime/starpu/control/runtime_control.c +++ b/runtime/starpu/control/runtime_control.c @@ -19,7 +19,9 @@ * @author Philippe Swartvagher * @author Samuel Thibault * @author Matthieu Kuhn - * @date 2022-02-22 + * @author Loris Lucido + * @author Terry Cojean + * @date 2023-01-30 * */ #include "chameleon_starpu.h" @@ -221,8 +223,12 @@ int RUNTIME_init( CHAM_context_t *chamctxt, ); #endif -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) starpu_cublas_init(); +#elif defined(CHAMELEON_USE_HIP) + starpu_hipblas_init(); +#endif #endif starpu_cham_tile_interface_init(); @@ -249,6 +255,9 @@ void RUNTIME_finalize( CHAM_context_t *chamctxt ) #if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) starpu_cublas_shutdown(); #endif +#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION) + starpu_hipblas_shutdown(); +#endif #if defined(CHAMELEON_USE_MPI) starpu_mpi_shutdown(); diff --git a/runtime/starpu/control/runtime_descriptor.c b/runtime/starpu/control/runtime_descriptor.c index 72dd1b81d81b6830edc807e5fdb4a48c9276bd90..385adc0df81a1a87de81971fe3590853b5214287 100644 --- a/runtime/starpu/control/runtime_descriptor.c +++ b/runtime/starpu/control/runtime_descriptor.c @@ -19,7 +19,8 @@ * @author Guillaume Sylvand * @author Raphael Boucherie * @author Samuel Thibault - * @date 2022-02-22 + * @author Loris Lucido + * @date 2023-01-30 * */ #include "chameleon_starpu.h" @@ -138,6 +139,26 @@ void RUNTIME_free( void *ptr, #endif } +#if defined(CHAMELEON_USE_CUDA) + +#define gpuError_t cudaError_t +#define gpuHostRegister cudaHostRegister +#define gpuHostUnregister cudaHostUnregister +#define gpuHostRegisterPortable cudaHostRegisterPortable +#define gpuSuccess cudaSuccess +#define gpuGetErrorString cudaGetErrorString + +#elif defined(CHAMELEON_USE_HIP) + +#define gpuError_t hipError_t +#define gpuHostRegister hipHostRegister +#define gpuHostUnregister hipHostUnregister +#define gpuHostRegisterPortable hipHostRegisterPortable +#define gpuSuccess hipSuccess +#define gpuGetErrorString hipGetErrorString + +#endif + /** * Create data descriptor */ @@ -155,25 +176,27 @@ void RUNTIME_desc_create( CHAM_desc_t *desc ) desc->schedopt = (void*)calloc(lnt*lmt,sizeof(starpu_data_handle_t)); assert(desc->schedopt); -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) /* - * Register allocated memory as CUDA pinned memory + * Register allocated memory as GPU pinned memory */ if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) { int64_t eltsze = CHAMELEON_Element_Size(desc->dtyp); size_t size = (size_t)(desc->llm) * (size_t)(desc->lln) * eltsze; - cudaError_t rc; + gpuError_t rc; /* Register the matrix as pinned memory */ - rc = cudaHostRegister( desc->mat, size, cudaHostRegisterPortable ); - if ( rc != cudaSuccess ) + rc = gpuHostRegister( desc->mat, size, gpuHostRegisterPortable ); + if ( rc != gpuSuccess ) { /* Disable the unregister as register failed */ desc->register_mat = 0; - chameleon_warning("RUNTIME_desc_create(StarPU): cudaHostRegister - ", cudaGetErrorString( rc )); + chameleon_warning("RUNTIME_desc_create(StarPU): gpuHostRegister - ", gpuGetErrorString( rc )); } } +#endif #endif if (desc->ooc) { @@ -247,18 +270,20 @@ void RUNTIME_desc_destroy( CHAM_desc_t *desc ) } } -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) if ( (desc->use_mat == 1) && (desc->register_mat == 1) ) { /* Unmap the pinned memory associated to the matrix */ - if (cudaHostUnregister(desc->mat) != cudaSuccess) + if (gpuHostUnregister(desc->mat) != gpuSuccess) { chameleon_warning("RUNTIME_desc_destroy(StarPU)", - "cudaHostUnregister failed to unregister the " + "gpuHostUnregister failed to unregister the " "pinned memory associated to the matrix"); } } -#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif +#endif free(desc->schedopt); } diff --git a/runtime/starpu/control/runtime_workspace.c b/runtime/starpu/control/runtime_workspace.c index fb209146e3c7e44b4b129379bf23536620322cf9..ad9b5cd9fd0d0abb8f7760b9a66c0936473f1281 100644 --- a/runtime/starpu/control/runtime_workspace.c +++ b/runtime/starpu/control/runtime_workspace.c @@ -15,11 +15,26 @@ * @author Cedric Augonnet * @author Mathieu Faverge * @author Florent Pruvost - * @date 2022-02-22 + * @author Loris Lucido + * @date 2023-01-30 * */ #include "chameleon_starpu.h" +#if defined(CHAMELEON_USE_CUDA) +#define GPU_WORKER_TYPE STARPU_CUDA_WORKER +#define gpuMallocHost cudaMallocHost +#define gpuMalloc cudaMalloc +#define gpuFreeHost cudaFreeHost +#define gpuFree cudaFree +#elif defined(CHAMELEON_USE_HIP) +#define GPU_WORKER_TYPE STARPU_HIP_WORKER +#define gpuMallocHost hipMallocHost +#define gpuMalloc hipMalloc +#define gpuFreeHost hipFreeHost +#define gpuFree hipFree +#endif + static void RUNTIME_allocate_workspace_on_workers(void *arg) { struct chameleon_starpu_ws_s *workspace = arg; @@ -28,9 +43,10 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg) int id = starpu_worker_get_id(); -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) type = starpu_worker_get_type(id); - if (type == STARPU_CUDA_WORKER) + if ( type == GPU_WORKER_TYPE ) { int memory_location = workspace->memory_location; @@ -39,14 +55,15 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg) /* Use pinned memory because the kernel is very likely * to transfer these data between the CPU and the GPU. * */ - cudaMallocHost(&workspace->workspaces[id], workspace->size); + gpuMallocHost( &workspace->workspaces[id], workspace->size ); } else { /* Allocate on the device */ - cudaMalloc(&workspace->workspaces[id], workspace->size); + gpuMalloc( &workspace->workspaces[id], workspace->size ); } } else +#endif #endif { /* This buffer should only be used within the CPU kernel, so @@ -54,7 +71,7 @@ static void RUNTIME_allocate_workspace_on_workers(void *arg) workspace->workspaces[id] = malloc(workspace->size); } - assert(workspace->workspaces[id]); + assert( workspace->workspaces[id] ); } @@ -65,24 +82,26 @@ static void RUNTIME_free_workspace_on_workers(void *arg) (void)type; int id = starpu_worker_get_id(); -#if defined(CHAMELEON_USE_CUDA) && !defined(CHAMELEON_SIMULATION) - type = starpu_worker_get_type(id); - if (type == STARPU_CUDA_WORKER) +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) + type = starpu_worker_get_type( id ); + if ( type == GPU_WORKER_TYPE ) { int memory_location = workspace->memory_location; if (memory_location == CHAMELEON_HOST_MEM) { - cudaFreeHost(workspace->workspaces[id]); + gpuFreeHost( workspace->workspaces[id] ); } else { - cudaFree(workspace->workspaces[id]); + gpuFree( workspace->workspaces[id] ); } } else +#endif #endif { - free(workspace->workspaces[id]); + free( workspace->workspaces[id] ); } workspace->workspaces[id] = NULL; diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index e2d964ccc98ae105c4bd79ceac832f77c8ea89e2..e92e23387f1f20912e3d28f910047c2d9629dfd4 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -17,7 +17,9 @@ * @author Florent Pruvost * @author Philippe Swartvagher * @author Samuel Thibault - * @date 2022-02-22 + * @author Loris Lucido + * @author Terry Cojean + * @date 2023-01-30 * */ #ifndef _chameleon_starpu_h_ @@ -71,6 +73,14 @@ #include <starpu_cublas_v2.h> #endif +#if defined(CHAMELEON_USE_HIP) && !defined(CHAMELEON_SIMULATION) +#include <starpu_scheduler.h> +#include <starpu_hip.h> + +#include <hipblas/hipblas.h> +#include <starpu_hipblas.h> +#endif + #if defined(CHAMELEON_SIMULATION) # if !defined(STARPU_SIMGRID) # error "Starpu was not built with simgrid support (--enable-simgrid). Can not run Chameleon with simulation support." diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 82afe7b42cea6f37c0bd2ede6aeaf0041048fa34..673d8c5d7a6ea287ed2fa4e948c6d8820f35a00a 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -17,7 +17,8 @@ * @author Cedric Castagnede * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @author Loris Lucido + * @date 2023-01-30 * @precisions normal z -> c d s * */ @@ -31,11 +32,17 @@ #if !defined(CHAMELEON_SIMULATION) #include "coreblas/coreblas_z.h" #include "coreblas/coreblas_ztile.h" + #if defined(CHAMELEON_USE_CUDA) #include "cudablas.h" #endif + +#if defined(CHAMELEON_USE_HIP) +#include "hipblas.h" #endif +#endif /* !defined(CHAMELEON_SIMULATION) */ + /* * BLAS 1 functions */ diff --git a/runtime/starpu/include/runtime_codelets.h b/runtime/starpu/include/runtime_codelets.h index d5c923355299277f32747c76f5534bf3edbd5ce0..00808fc52fec20078744d3603bec4cb2be680a24 100644 --- a/runtime/starpu/include/runtime_codelets.h +++ b/runtime/starpu/include/runtime_codelets.h @@ -16,7 +16,8 @@ * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost - * @date 2022-02-22 + * @author Loris Lucido + * @date 2023-01-30 * */ #ifndef _runtime_codelets_h_ @@ -25,14 +26,31 @@ #include "chameleon/config.h" #include "runtime_codelet_profile.h" -//#undef STARPU_CUDA_ASYNC -#ifdef STARPU_CUDA_ASYNC +#if defined(STARPU_CUDA_ASYNC) #define CODELET_CUDA_FLAGS(flags) .cuda_flags = {(flags)}, #else #define CODELET_CUDA_FLAGS(flags) #endif -#define CODELETS_ALL(cl_name, cpu_func_name, cuda_func_name, _original_location_, cuda_flags) \ +#if defined(STARPU_HIP_ASYNC) +#define CODELET_HIP_FLAGS(flags) .hip_flags = {(flags)}, +#else +#define CODELET_HIP_FLAGS(flags) +#endif + +#if defined(CHAMELEON_USE_CUDA) +#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags ) \ + CODELET_CUDA_FLAGS( gpu_flags ) \ + .cuda_func = ((gpu_func_name)), +#elif defined(CHAMELEON_USE_HIP) +#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags ) \ + CODELET_HIP_FLAGS( gpu_flags ) \ + .hip_funcs = {(gpu_func_name)}, +#else +#define CODELET_GPU_FIELDS( gpu_func_name, gpu_flags ) +#endif + +#define CODELETS_ALL(cl_name, cpu_func_name, gpu_func_name, _original_location_, gpu_flags) \ struct starpu_perfmodel cl_##cl_name##_fake = { \ .type = STARPU_HISTORY_BASED, \ .symbol = "fake_"#cl_name \ @@ -46,8 +64,7 @@ struct starpu_codelet cl_##cl_name = { \ .where = (_original_location_), \ .cpu_func = ((cpu_func_name)), \ - CODELET_CUDA_FLAGS(cuda_flags) \ - .cuda_func = ((cuda_func_name)), \ + CODELET_GPU_FIELDS( gpu_func_name, gpu_flags ) \ .nbuffers = STARPU_VARIABLE_NBUFFERS, \ .model = &cl_##cl_name##_model, \ .name = #cl_name \ @@ -77,8 +94,13 @@ CODELETS_ALL( name, cpu_func_name, NULL, STARPU_CPU, 0 ) #endif -#define CODELETS_GPU(name, cpu_func_name, cuda_func_name, cuda_flags) \ - CODELETS_ALL( name, cpu_func_name, cuda_func_name, STARPU_CPU | STARPU_CUDA, cuda_flags ) +#if defined(CHAMELEON_USE_HIP) +#define CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags) \ + CODELETS_ALL( name, cpu_func_name, gpu_func_name, STARPU_CPU | STARPU_HIP, gpu_flags ) +#else +#define CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags) \ + CODELETS_ALL( name, cpu_func_name, gpu_func_name, STARPU_CPU | STARPU_CUDA, gpu_flags ) +#endif #define CODELETS_ALL_HEADER(name) \ CHAMELEON_CL_CB_HEADER(name); \ @@ -89,29 +111,35 @@ void cl_##name##_restore_where(void) #if defined(CHAMELEON_SIMULATION) -#if defined(CHAMELEON_USE_CUDA) -#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \ - CODELETS_GPU(name, (starpu_cpu_func_t) 1, (starpu_cuda_func_t) 1, cuda_flags) + +#if defined(CHAMELEON_USE_CUDA) || defined(CHAMELEON_USE_HIP) +#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \ + CODELETS_GPU(name, (starpu_cpu_func_t) 1, (starpu_cuda_func_t) 1, gpu_flags) #define CODELETS_HEADER(name) CODELETS_ALL_HEADER(name) #else -#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \ +#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \ CODELETS_CPU(name, (starpu_cpu_func_t) 1) #define CODELETS_HEADER(name) CODELETS_ALL_HEADER(name) #endif -#elif defined(CHAMELEON_USE_CUDA) -#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \ - CODELETS_GPU(name, cpu_func_name, cuda_func_name, cuda_flags) + +#else /* defined(CHAMELEON_SIMULATION) */ + +#if defined(CHAMELEON_USE_CUDA) //|| defined(CHAMELEON_USE_HIP) +#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \ + CODELETS_GPU(name, cpu_func_name, gpu_func_name, gpu_flags) #define CODELETS_HEADER(name) CODELETS_ALL_HEADER(name) #else -#define CODELETS(name, cpu_func_name, cuda_func_name, cuda_flags) \ +#define CODELETS(name, cpu_func_name, gpu_func_name, gpu_flags) \ CODELETS_CPU(name, cpu_func_name) #define CODELETS_HEADER(name) CODELETS_ALL_HEADER(name) #endif +#endif + CODELETS_HEADER(map); #endif /* _runtime_codelets_h_ */