diff --git a/CMakeLists.txt b/CMakeLists.txt index 13b5ea5fa3dc09b606549c1cc0d9289b2a7ccab6..27bb9597f36a7e5ecbf7a686c9241788cca6378f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -30,7 +30,7 @@ # @author Alycia Lisito # @author Loris Lucido # @author Nathan Précigout -# @date 2023-01-30 +# @date 2023-07-04 # ### cmake_minimum_required(VERSION 3.3) @@ -45,7 +45,7 @@ endif() # set project version number set(CHAMELEON_VERSION_MAJOR 1) -set(CHAMELEON_VERSION_MINOR 2) +set(CHAMELEON_VERSION_MINOR 3) set(CHAMELEON_VERSION_MICRO 0) set(CHAMELEON_VERSION "${CHAMELEON_VERSION_MAJOR}.${CHAMELEON_VERSION_MINOR}.${CHAMELEON_VERSION_MICRO}") diff --git a/ChangeLog b/ChangeLog index 3ed3e67b1a16d6fae49130dc78ec8bf325d1a8d5..a3b5120ddae5a526f4f5db287c7a85a5f5e2d9e9 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,8 @@ +chameleon-1.3.0 +------------------------------------------------------------------------ + - types: add support for half precision arithmetic into the data descriptors + - cuda: add half precision conversion kernels, and variants of the gemm kernels (hgemm, and gemmex) + chameleon-1.2.0 ------------------------------------------------------------------------ - NEW: Add support for AMD GPUs throug hipcublas or hip-rocm kernels diff --git a/cmake_modules/local_subs.py b/cmake_modules/local_subs.py index 02af5df38a90e034eebb9438238440267a1a3355..df41dd109b68835ae2f07759d0bb81b758bfe075 100644 --- a/cmake_modules/local_subs.py +++ b/cmake_modules/local_subs.py @@ -6,12 +6,12 @@ @copyright 2019-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, Univ. Bordeaux. All rights reserved. - @version 1.2.0 + @version 1.3.0 @author Mathieu Faverge @author Florent Pruvost @author Nathalie Furmento @author Alycia Lisito - @date 2022-02-22 + @date 2023-07-04 """ _extra_blas = [ @@ -39,6 +39,10 @@ _extra_blas = [ ('', 'slatm1', 'dlatm1', 'slatm1', 'dlatm1' ), ('', 'sgenm2', 'dgenm2', 'cgenm2', 'zgenm2' ), ('', 'slag2c_fake', 'dlag2z_fake', 'slag2c', 'dlag2z' ), + ('', 'slag2h', 'dlag2h', 'slag2h', 'dlag2h' ), + ('', 'hlag2s', 'hlag2d', 'hlag2s', 'hlag2d' ), + ('', 'slag2h', 'dlag2h', 'clag2x', 'zlag2x' ), + ('', 'hlag2s', 'hlag2d', 'xlag2c', 'xlag2z' ), ('', 'sgepdf', 'dgepdf', 'cgepdf', 'zgepdf' ), ('', 'scesca', 'dcesca', 'ccesca', 'zcesca' ), ('', 'sgesum', 'dgesum', 'cgesum', 'zgesum' ), @@ -73,6 +77,9 @@ subs = { ('int', 'float', 'double', 'CHAMELEON_Complex32_t', r'\bCHAMELEON_Complex64_t'), ('ChamPattern', 'ChamRealFloat', 'ChamRealDouble', 'ChamComplexFloat', r'\bChamComplexDouble' ), ('ChamPattern', 'ChamRealFloat', 'ChamRealDouble', 'ChamRealFloat', r'\bChamRealDouble' ), + ('ChamPattern', 'ChamRealFloat', 'ChamRealFloat', 'ChamComplexFloat', r'\bChamComplexFloat' ), + ('ChamPattern', 'ChamRealFloat', 'ChamRealFloat', 'ChamRealFloat', r'\bChamRealFloat' ), + ('ChamPattern', 'ChamRealHalf', 'ChamRealHalf', 'ChamComplexHalf', r'\bChamComplexHalf' ), ('int', 'float', 'double', 'complex32', 'complex64' ), ('Int', 'Float', 'Double', 'Complex32', 'Complex64' ), ('Int', 'HMAT_SIMPLE_PRECISION','HMAT_DOUBLE_PRECISION','HMAT_SIMPLE_COMPLEX', 'HMAT_DOUBLE_COMPLEX'), diff --git a/cmake_modules/morse_cmake b/cmake_modules/morse_cmake index b9e9ec80030a349e1c768f8aec2879d1a6ed28a9..332bf7de07c7eaf85cf370236d5a209b83a58dd7 160000 --- a/cmake_modules/morse_cmake +++ b/cmake_modules/morse_cmake @@ -1 +1 @@ -Subproject commit b9e9ec80030a349e1c768f8aec2879d1a6ed28a9 +Subproject commit 332bf7de07c7eaf85cf370236d5a209b83a58dd7 diff --git a/control/auxiliary.c b/control/auxiliary.c index 1d1b9147fe4e006c6557b07d225fe14130a7c754..d299d8fa8b47f9c5e2eb04f6a66c62c8dbafdc78 100644 --- a/control/auxiliary.c +++ b/control/auxiliary.c @@ -11,7 +11,7 @@ * * @brief Chameleon auxiliary routines * - * @version 1.2.0 + * @version 1.3.0 * @author Jakub Kurzak * @author Piotr Luszczek * @author Emmanuel Agullo @@ -19,7 +19,7 @@ * @author Florent Pruvost * @author Guillaume Sylvand * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-04 * *** * @@ -187,13 +187,17 @@ int CHAMELEON_Version(int *ver_major, int *ver_minor, int *ver_micro) * @retval Element size in bytes * */ -int CHAMELEON_Element_Size(int type) +int CHAMELEON_Element_Size( cham_flttype_t type ) { - switch(type) { + switch( cham_get_flttype(type) ) { case ChamByte: return 1; - case ChamInteger: return sizeof(int); + case ChamInteger16: return sizeof(int16_t); + case ChamInteger32: return sizeof(int32_t); + case ChamInteger64: return sizeof(int64_t); + case ChamRealHalf: return 2; case ChamRealFloat: return sizeof(float); case ChamRealDouble: return sizeof(double); + case ChamComplexHalf: return 4; case ChamComplexFloat: return 2*sizeof(float); case ChamComplexDouble: return 2*sizeof(double); default: chameleon_fatal_error("CHAMELEON_Element_Size", "undefined type"); diff --git a/control/control.c b/control/control.c index 62762469f26f788eb215811a663355b0489ef616..3c9ba6a9259f83f1857449e6f6c429a2ecaa396e 100644 --- a/control/control.c +++ b/control/control.c @@ -11,7 +11,7 @@ * * @brief Chameleon control routines * - * @version 1.2.0 + * @version 1.3.0 * @author Jakub Kurzak * @author Mathieu Faverge * @author Cedric Castagnede @@ -20,7 +20,7 @@ * @author Samuel Thibault * @author Philippe Swartvagher * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-04 * *** * @@ -123,6 +123,7 @@ int __chameleon_initpar(int ncpus, int ngpus, int nthreads_per_worker) } #endif + chamctxt->ncudas = ngpus; return RUNTIME_init( chamctxt, ncpus, ngpus, nthreads_per_worker ); } diff --git a/control/descriptor.c b/control/descriptor.c index 1005a04cff833e3b38bb8134cfa5b47ebef40fd5..c5e4ca2d23ebd0507c752c8b0852ad6edbe1ee82 100644 --- a/control/descriptor.c +++ b/control/descriptor.c @@ -11,14 +11,14 @@ * * @brief Chameleon descriptors routines * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Guillaume Sylvand * @author Raphael Boucherie * @author Samuel Thibault - * @date 2022-12-13 + * @date 2023-07-04 * *** * @@ -91,6 +91,7 @@ int chameleon_desc_mat_free( CHAM_desc_t *desc ) void chameleon_desc_init_tiles( CHAM_desc_t *desc, blkrankof_fct_t rankof ) { CHAM_tile_t *tile; + int8_t flttype = cham_get_flttype( desc->dtyp ); int ii, jj; assert( rankof != chameleon_getrankof_tile ); @@ -101,7 +102,7 @@ void chameleon_desc_init_tiles( CHAM_desc_t *desc, blkrankof_fct_t rankof ) for( ii=0; ii<desc->lmt; ii++, tile++ ) { int rank = rankof( desc, ii, jj ); tile->format = CHAMELEON_TILE_FULLRANK; - tile->flttype = (int8_t)(desc->dtyp); + tile->flttype = flttype; tile->rank = rank; tile->m = ii == desc->lmt-1 ? desc->lm - ii * desc->mb : desc->mb; tile->n = jj == desc->lnt-1 ? desc->ln - jj * desc->nb : desc->nb; @@ -368,6 +369,8 @@ void chameleon_desc_destroy( CHAM_desc_t *desc ) */ int chameleon_desc_check(const CHAM_desc_t *desc) { + cham_flttype_t flttype; + if (desc == NULL) { chameleon_error("chameleon_desc_check", "NULL descriptor"); return CHAMELEON_ERR_NOT_INITIALIZED; @@ -376,11 +379,16 @@ int chameleon_desc_check(const CHAM_desc_t *desc) chameleon_error("chameleon_desc_check", "NULL matrix pointer"); return CHAMELEON_ERR_UNALLOCATED; } - if (desc->dtyp != ChamInteger && - desc->dtyp != ChamRealFloat && - desc->dtyp != ChamRealDouble && - desc->dtyp != ChamComplexFloat && - desc->dtyp != ChamComplexDouble ) { + + flttype = cham_get_flttype( desc->dtyp ); + if ( (flttype != ChamInteger ) && + (flttype != ChamRealHalf ) && + (flttype != ChamRealFloat ) && + (flttype != ChamRealDouble ) && + (flttype != ChamComplexHalf ) && + (flttype != ChamComplexFloat ) && + (flttype != ChamComplexDouble ) ) + { chameleon_error("chameleon_desc_check", "invalid matrix type"); return CHAMELEON_ERR_ILLEGAL_VALUE; } @@ -435,8 +443,11 @@ CHAMELEON_Desc_SubMatrix( CHAM_desc_t *descA, int i, int j, int m, int n ) * * @param[in] dtyp * Data type of the matrix: + * @arg ChamInteger: integer (i), + * @arg ChamRealHalf: half precision real (H), * @arg ChamRealFloat: single precision real (S), * @arg ChamRealDouble: double precision real (D), + * @arg ChamComplexHalf: half precision complex (), * @arg ChamComplexFloat: single precision complex (C), * @arg ChamComplexDouble: double precision complex (Z). * diff --git a/control/descriptor.h b/control/descriptor.h index cff3b76b9af50744245e6cde721b65f78788f125..96bf0bf0bc7f264f44beea28660de358c35a7a90 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -11,7 +11,7 @@ * * @brief Chameleon descriptor header * - * @version 1.2.0 + * @version 1.3.0 * @author Jakub Kurzak * @author Mathieu Faverge * @author Cedric Castagnede @@ -19,7 +19,7 @@ * @author Guillaume Sylvand * @author Raphael Boucherie * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-04 * */ #ifndef _chameleon_descriptor_h_ @@ -76,7 +76,7 @@ int chameleon_desc_check ( const CHAM_desc_t *desc ); /** * Internal function to return address of block (m,n) with m,n = block indices */ -inline static CHAM_tile_t *chameleon_desc_gettile(const CHAM_desc_t *A, int m, int n) +inline static CHAM_tile_t *chameleon_desc_gettile(const CHAM_desc_t *A, int m, int n ) { size_t mm = m + A->i / A->mb; size_t nn = n + A->j / A->nb; diff --git a/gpucublas/compute/CMakeLists.txt b/gpucublas/compute/CMakeLists.txt index d7a745c0489c3364398267f64513737a9348d9a3..5f2987d984f4953d479665b25b494c1d2cf6a213 100644 --- a/gpucublas/compute/CMakeLists.txt +++ b/gpucublas/compute/CMakeLists.txt @@ -17,14 +17,17 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Florent Pruvost # @author Guillaume Sylvand # @author Mathieu Faverge -# @date 2022-02-22 +# @date 2023-07-04 # ### +# To define CMAKE_CUDA_COMPILER +cmake_minimum_required(VERSION 3.18) + # Generate the chameleon sources for all possible precisions # ------------------------------------------------------ set(GPUCUBLAS_SRCS_GENERATED "") @@ -53,6 +56,28 @@ set(ZSRC cuda_zunmqrt.c ) +# Add CUDA kernel if compiler and toolkit are available +# ----------------------------------------------------- +include(CheckLanguage) +check_language(CUDA) + +if(CMAKE_CUDA_COMPILER) + enable_language(CUDA) + find_package(CUDAToolkit) +else() + message(STATUS "CUDA language is not supported") +endif() + +if (CUDAToolkit_FOUND) + include(SetCMakeCudaArchitectures) + + set(ZSRC + ${ZSRC} + cuda_zlag2c.cu + cuda_dlag2h.cu + ) +endif() + # Former MAGMA files that are no longer supported # if( CHAMELEON_USE_MAGMA ) # set(ZSRC @@ -77,9 +102,14 @@ precisions_rules_py( set(GPUCUBLAS_SRCS ${GPUCUBLAS_SRCS_GENERATED} + cuda_hgemm.c + cuda_gemmex.c cudaglobal.c ) +# Need to use CXX compiler to have the __half support and access to cublasHgemm() +set_source_files_properties( cuda_hgemm.c PROPERTIES LANGUAGE CXX ) + # Force generation of sources # --------------------------- add_custom_target(gpucublas_sources ALL SOURCES ${GPUCUBLAS_SRCS}) @@ -102,6 +132,12 @@ set_property(TARGET gpucublas PROPERTY INSTALL_NAME_DIR "${CMAKE_INSTALL_PREFIX} target_link_libraries(gpucublas PRIVATE coreblas CUDA::CUBLAS) target_link_libraries(gpucublas PUBLIC MORSE::M) +set_target_properties(gpucublas PROPERTIES + CUDA_SEPARABLE_COMPILATION OFF) + +#target_include_directories( gpucublas PRIVATE ${CUDAToolkit_INCLUDE_DIRS}) +#target_link_libraries( gpucublas PRIVATE CUDA::cublas CUDA::cudart ) + # export target coreblas install(EXPORT gpucublasTargets NAMESPACE CHAMELEON:: diff --git a/gpucublas/compute/cuda_dlag2h.cu b/gpucublas/compute/cuda_dlag2h.cu new file mode 100644 index 0000000000000000000000000000000000000000..8136c248fd9ff2c02a4910b251f282acdbe4cc77 --- /dev/null +++ b/gpucublas/compute/cuda_dlag2h.cu @@ -0,0 +1,290 @@ +/** + * + * @file cuda_dlag2h.cu + * + * @copyright 2023-2023 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_dlag2h GPU kernel + * + * @version 1.3.0 + * @author Mark Gates + * @author Mathieu Faverge + * @date 2023-07-04 + * @precisions normal d -> d s + * + * This file is an adaptation of the MAGMA zlag2c.cu, clag2z.cu, hlag2s.cu, and slag2h.cu + * + */ +#include "gpucublas.h" +#include <coreblas/lapacke.h> + +#if CUDA_VERSION < 7500 +#error "This file should not be included as the half precision floats are not supported." +#endif + +#define BLK_X 64 +#define BLK_Y 32 + +__device__ int cuda_dlag2h_flag = 0; + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_dlag2h_kernel( + int m, int n, + const double *A, int lda, + CHAMELEON_Real16_t *HA, int ldha, + double rmax ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + double tmp; + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + HA += ind + iby*ldha; + if ( full ) { + /* full block-column */ +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { + tmp = A[j*lda]; + + if ( fabs(tmp) > rmax ) { + cuda_dlag2h_flag = 1; + } + + HA[j*ldha] = __double2half(tmp); + } + } + else { + /* partial block-column */ + for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) { + tmp = A[j*lda]; + if ( fabs(tmp) > rmax ) { + cuda_dlag2h_flag = 1; + } + + HA[j*ldha] = __double2half(tmp); + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_dlag2h converts a double-real matrix, A, to a half-real matrix, + * HA. + * + * RMAX is the overflow for the single-real arithmetic. DLAG2H checks that + * all the entries of A are between -RMAX and RMAX. If not, the conversion is + * aborted and a magma_dlag2h_flag is raised. + * + * @param[in] m + * The number of lines of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] A + * On entry, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. LDA >= max(1,m). + * + * @param[out] HA + * On exit, if INFO=0, the ldha-by-n coefficient matrix HA; + * if INFO > 0, the content of HA is unspecified. + * + * @param[in] ldha + * The leading dimension of the array HA. LDHA >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * - = 1: an entry of the matrix A is greater than the SINGLE PRECISION + * overflow threshold, in this case, the content + * of HA on exit is unspecified. + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_dlag2h( int m, int n, + const double *A, int lda, + CHAMELEON_Real16_t *HA, int ldha, + cublasHandle_t handle ) +{ + cudaStream_t stream; + double rmax; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( lda < chameleon_max(1,m) ) { + return -4; + } + else if ( ldha < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( m == 0 || n == 0 ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + /* + * There is no lapackf77_hlamch, please visit: + * https://blogs.mathworks.com/cleve/2017/05/08/half-precision-16-bit-floating-point-arithmetic/ + */ + rmax = 65504.; + + cublasGetStream( handle, &stream ); + + cuda_dlag2h_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, HA, ldha, rmax ); + + return 0; +} + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_hlag2d_kernel( + int m, int n, + const CHAMELEON_Real16_t *HA, int ldha, + double *A, int lda ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + HA += ind + iby*ldha; + if ( full ) { + // full block-column +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { +#if defined(PRECISION_zc) + A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y ); +#else + A[j*ldha] = HA[j*ldha]; +#endif + } + } + else { + // partial block-column + for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) { +#if defined(PRECISION_zc) + A[j*ldha] = make_double( HA[j*ldha].x, HA[j*ldha].y ); +#else + A[j*ldha] = HA[j*ldha]; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_hlag2d converts a half-real matrix, HA, to a double-real matrix, + * A. + * + * Note that while it is possible to overflow while converting from double to + * single, it is not possible to overflow when converting from single to + * double. + * + * @param[in] m + * The number of lines of the matrix A and HA. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A and HA. n >= 0. + * + * @param[in] HA + * On entry, the lda-by-n coefficient matrix HA. + * + * @param[in] ldha + * The leading dimension of the array HA. ldha >= max(1,m). + * + * @param[out] A + * On exit, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_hlag2d( int m, int n, + const CHAMELEON_Real16_t *HA, int ldha, + double *A, int lda, + cublasHandle_t handle ) +{ + cudaStream_t stream; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( ldha < chameleon_max(1,m) ) { + return -4; + } + else if ( lda < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( (m == 0) || (n == 0) ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + cublasGetStream( handle, &stream ); + cuda_hlag2d_kernel<<< grid, threads, 0, stream >>> ( m, n, HA, ldha, A, lda ); + + return 0; +} diff --git a/gpucublas/compute/cuda_gemmex.c b/gpucublas/compute/cuda_gemmex.c new file mode 100644 index 0000000000000000000000000000000000000000..c384018e900fc88a8939bac3f042c972f0dc367c --- /dev/null +++ b/gpucublas/compute/cuda_gemmex.c @@ -0,0 +1,43 @@ +/** + * + * @file cuda_gemmex.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_gemmex GPU kernel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-04 + * + */ +#include "gpucublas.h" + +int +CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const void *alpha, + const void *A, int lda, cham_flttype_t Atype, + const void *B, int ldb, cham_flttype_t Btype, + const void *beta, + void *C, int ldc, cham_flttype_t Ctype, + cublasHandle_t handle ) +{ + cublasStatus_t rc; + + rc = cublasGemmEx( handle, + chameleon_cublas_const(transa), chameleon_cublas_const(transb), + m, n, k, + CUBLAS_VALUE(alpha), A, lda, chameleon_cublas_dtype( Atype ), + B, ldb, chameleon_cublas_dtype( Btype ), + CUBLAS_VALUE(beta), C, ldc, chameleon_cublas_dtype( Ctype ), + chameleon_cublas_ctype( Ctype ), + CUBLAS_GEMM_DEFAULT ); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; + return CHAMELEON_SUCCESS; +} diff --git a/gpucublas/compute/cuda_hgemm.c b/gpucublas/compute/cuda_hgemm.c new file mode 100644 index 0000000000000000000000000000000000000000..02f465a6a2b8efbc5cfb5551c3266d30736591e7 --- /dev/null +++ b/gpucublas/compute/cuda_hgemm.c @@ -0,0 +1,42 @@ +/** + * + * @file cuda_hgemm.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_hgemm GPU kernel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-04 + * + */ +#include "gpucublas.h" + +extern "C" int +CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const CHAMELEON_Real16_t *alpha, + const CHAMELEON_Real16_t *A, int lda, + const CHAMELEON_Real16_t *B, int ldb, + const CHAMELEON_Real16_t *beta, + CHAMELEON_Real16_t *C, int ldc, + cublasHandle_t handle ) +{ + cublasStatus_t rc; + + rc = cublasHgemm( handle, + (cublasOperation_t)chameleon_cublas_const(transa), + (cublasOperation_t)chameleon_cublas_const(transb), + m, n, k, + CUBLAS_VALUE(alpha), A, lda, + B, ldb, + CUBLAS_VALUE(beta), C, ldc); + + assert( rc == CUBLAS_STATUS_SUCCESS ); + (void)rc; + return CHAMELEON_SUCCESS; +} diff --git a/gpucublas/compute/cuda_zlag2c.cu b/gpucublas/compute/cuda_zlag2c.cu new file mode 100644 index 0000000000000000000000000000000000000000..cf3f35062fbe336c1c8f72119c659ba99e4693da --- /dev/null +++ b/gpucublas/compute/cuda_zlag2c.cu @@ -0,0 +1,304 @@ +/** + * + * @file cuda_zlag2c.cu + * + * @copyright 2023-2023 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon cuda_zlag2c GPU kernel + * + * @version 1.3.0 + * @author Mark Gates + * @author Mathieu Faverge + * @date 2023-07-04 + * @precisions mixed zc -> ds + * + * This file is an adaptation of the MAGMA zlag2c.cu and clag2z files. + * + */ +#include "gpucublas.h" +#include <coreblas/lapacke.h> + +#define BLK_X 64 +#define BLK_Y 32 + +__device__ int cuda_zlag2c_flag = 0; + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_zlag2c_kernel( + int m, int n, + const cuDoubleComplex *A, int lda, + cuFloatComplex *SA, int ldsa, + double rmax ) +{ + cuDoubleComplex tmp; + + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + SA += ind + iby*ldsa; + if ( full ) { + /* full block-column */ +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { + tmp = A[j*lda]; + + if ( +#if defined(PRECISION_zc) + (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax) +#else + (fabs(tmp) > rmax) +#endif + ) + { + cuda_zlag2c_flag = 1; + } + +#if defined(PRECISION_zc) + SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y); +#else + SA[j*ldsa] = tmp; +#endif + } + } + else { + /* partial block-column */ + for( int j=0; (j < BLK_Y) && (iby+j < n); ++j ) { + tmp = A[j*lda]; + if ( +#if defined(PRECISION_zc) + (fabs(tmp.x) > rmax) || (fabs(tmp.y) > rmax) +#else + (fabs(tmp) > rmax) +#endif + ) + { + cuda_zlag2c_flag = 1; + } + +#if defined(PRECISION_zc) + SA[j*ldsa] = make_cuFloatComplex(tmp.x, tmp.y); +#else + SA[j*ldsa] = tmp; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_zlag2c converts a double-complex matrix, A, to a single-complex matrix, + * SA. + * + * RMAX is the overflow for the single-complex arithmetic. ZLAG2C checks that + * all the entries of A are between -RMAX and RMAX. If not, the conversion is + * aborted and a magma_zlag2c_flag is raised. + * + * @param[in] m + * The number of lines of the matrix A. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A. n >= 0. + * + * @param[in] A + * On entry, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. LDA >= max(1,m). + * + * @param[out] SA + * On exit, if INFO=0, the ldsa-by-n coefficient matrix SA; + * if INFO > 0, the content of SA is unspecified. + * + * @param[in] ldsa + * The leading dimension of the array SA. LDSA >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * - = 1: an entry of the matrix A is greater than the COMPLEX + * overflow threshold, in this case, the content + * of SA on exit is unspecified. + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_zlag2c( int m, int n, + const cuDoubleComplex *A, int lda, + cuFloatComplex *SA, int ldsa, + cublasHandle_t handle ) +{ + cudaStream_t stream; + double rmax; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( lda < chameleon_max(1,m) ) { + return -4; + } + else if ( ldsa < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( m == 0 || n == 0 ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + rmax = LAPACKE_slamch_work( 'O' ); + cublasGetStream( handle, &stream ); + + cuda_zlag2c_kernel<<< grid, threads, 0, stream >>>( m, n, A, lda, SA, ldsa, rmax ); + + return 0; +} + +/* + * Divides matrix into ceil( m/BLK_X ) x ceil( n/BLK_Y ) blocks. + * Each block has BLK_X threads. + * Each thread loops across one row, updating BLK_Y entries. + */ +__global__ +void cuda_clag2z_kernel( + int m, int n, + const cuFloatComplex *SA, int ldsa, + cuDoubleComplex *A, int lda ) +{ + int ind = blockIdx.x * BLK_X + threadIdx.x; + int iby = blockIdx.y * BLK_Y; + + /* check if full block-column */ + bool full = (iby + BLK_Y <= n); + + /* do only rows inside matrix */ + if ( ind > m ) { + return; + } + + A += ind + iby*lda; + SA += ind + iby*ldsa; + if ( full ) { + // full block-column +#pragma unroll + for( int j=0; j < BLK_Y; ++j ) { +#if defined(PRECISION_zc) + A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y ); +#else + A[j*ldsa] = SA[j*ldsa]; +#endif + } + } + else { + // partial block-column + for( int j=0; (j < BLK_Y) && (iby+j) < n; ++j ) { +#if defined(PRECISION_zc) + A[j*ldsa] = make_cuDoubleComplex( SA[j*ldsa].x, SA[j*ldsa].y ); +#else + A[j*ldsa] = SA[j*ldsa]; +#endif + } + } +} + + +/** + * + * @ingroup CUDA_CHAMELEON_Complex64_t + * + * CUDA_clag2z converts a single-complex matrix, SA, to a double-complex matrix, + * A. + * + * Note that while it is possible to overflow while converting from double to + * single, it is not possible to overflow when converting from single to + * double. + * + * @param[in] m + * The number of lines of the matrix A and SA. m >= 0. + * + * @param[in] n + * The number of columns of the matrix A and SA. n >= 0. + * + * @param[in] SA + * On entry, the lda-by-n coefficient matrix SA. + * + * @param[in] ldsa + * The leading dimension of the array SA. ldsa >= max(1,m). + * + * @param[out] A + * On exit, the lda-by-n coefficient matrix A. + * + * @param[in] lda + * The leading dimension of the array A. lda >= max(1,m). + * + * @param[out] + * - = 0: successful exit. + * - < 0: if INFO = -i, the i-th argument had an illegal value + * + * @param[in] handle + * Cublas handle to execute in. + * + **/ +extern "C" int +CUDA_clag2z( int m, int n, + const cuFloatComplex *SA, int ldsa, + cuDoubleComplex *A, int lda, + cublasHandle_t handle ) +{ + cudaStream_t stream; + + if ( m < 0 ) { + return -1; + } + else if ( n < 0 ) { + return -2; + } + else if ( ldsa < chameleon_max(1,m) ) { + return -4; + } + else if ( lda < chameleon_max(1,m) ) { + return -6; + } + + /* quick return */ + if ( (m == 0) || (n == 0) ) { + return 0; + } + + dim3 threads( BLK_X, 1 ); + dim3 grid( chameleon_ceil( m, BLK_X ), chameleon_ceil( n, BLK_Y ) ); + + cublasGetStream( handle, &stream ); + cuda_clag2z_kernel<<< grid, threads, 0, stream >>> ( m, n, SA, ldsa, A, lda ); + + return 0; +} diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h index 8a9deb977ccae2c998f3b1390c5085febaee9104..8e7d4c3afa4d0165e2961c7791ae2dd2f12b3871 100644 --- a/gpucublas/include/gpucublas.h +++ b/gpucublas/include/gpucublas.h @@ -11,11 +11,11 @@ * * @brief Chameleon GPU kernels main header * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge * @author Nathalie Furmento - * @date 2022-02-22 + * @date 2023-07-04 * @precisions normal z -> c d s * */ @@ -35,6 +35,9 @@ #include <cuda.h> #include <cuComplex.h> +#if CUDA_VERSION >= 7500 +#include <cuda_fp16.h> +#endif #include <cublas_v2.h> @@ -58,6 +61,54 @@ BEGIN_C_DECLS #include "gpucublas/gpucublas_c.h" #include "gpucublas/gpucublas_s.h" +int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const CHAMELEON_Real16_t *alpha, + const CHAMELEON_Real16_t *A, int lda, + const CHAMELEON_Real16_t *B, int ldb, + const CHAMELEON_Real16_t *beta, + CHAMELEON_Real16_t *C, int ldc, + cublasHandle_t handle ); + +int CUDA_gemmex( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const void *alpha, + const void *A, int lda, cham_flttype_t Atype, + const void *B, int ldb, cham_flttype_t Btype, + const void *beta, + void *C, int ldc, cham_flttype_t Ctype, + cublasHandle_t handle ); + +static inline cublasComputeType_t +chameleon_cublas_ctype( cham_flttype_t flttype ) { + + switch ( flttype ) { + case ChamRealHalf : return CUBLAS_COMPUTE_16F; + case ChamRealFloat : return CUBLAS_COMPUTE_32F; + case ChamRealDouble : return CUBLAS_COMPUTE_64F; + case ChamComplexFloat : return CUBLAS_COMPUTE_32F; + case ChamComplexDouble : return CUBLAS_COMPUTE_64F; + default: + fprintf( stderr, "chameleon_cublas_ctype(): Incorrect flttype\n" ); + exit(1); + } +} + +static inline cudaDataType_t +chameleon_cublas_dtype( cham_flttype_t flttype ) { + + switch ( flttype ) { + case ChamRealHalf : return CUDA_R_16F; + case ChamRealFloat : return CUDA_R_32F; + case ChamRealDouble : return CUDA_R_64F; + case ChamComplexFloat : return CUDA_C_32F; + case ChamComplexDouble : return CUDA_C_64F; + default: + fprintf( stderr, "chameleon_cublas_dtype(): Incorrect flttype\n" ); + exit(1); + } +} + END_C_DECLS /** diff --git a/gpuhipblas/compute/CMakeLists.txt b/gpuhipblas/compute/CMakeLists.txt index bd7ac4540ccad44d08140014a6af1af43df248b5..cd3db9e9e0d0f951d03d331ed14bb8c0840cfcc5 100644 --- a/gpuhipblas/compute/CMakeLists.txt +++ b/gpuhipblas/compute/CMakeLists.txt @@ -17,12 +17,12 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Florent Pruvost # @author Guillaume Sylvand # @author Mathieu Faverge # @author Loris Lucido -# @date 2023-01-30 +# @date 2023-07-04 # ### @@ -47,6 +47,7 @@ precisions_rules_py( set(GPUHIPBLAS_SRCS ${GPUHIPBLAS_SRCS_GENERATED} + hip_hgemm.c hipglobal.c ) diff --git a/gpuhipblas/compute/hip_hgemm.c b/gpuhipblas/compute/hip_hgemm.c new file mode 100644 index 0000000000000000000000000000000000000000..12d7feac2bf453a72614e7a5271898f51b461879 --- /dev/null +++ b/gpuhipblas/compute/hip_hgemm.c @@ -0,0 +1,41 @@ +/** + * + * @file hip_hgemm.c + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon hip_hgemm GPU kernel + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-04 + * + */ +#include "gpuhipblas.h" + +int +HIP_hgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const CHAMELEON_Real16_t *alpha, + const CHAMELEON_Real16_t *A, int lda, + const CHAMELEON_Real16_t *B, int ldb, + const CHAMELEON_Real16_t *beta, + CHAMELEON_Real16_t *C, int ldc, + hipblasHandle_t handle ) +{ + hipblasStatus_t rc; + + rc = hipblasHgemm( handle, + chameleon_hipblas_const(transa), chameleon_hipblas_const(transb), + m, n, k, + HIPBLAS_VALUE(alpha), A, lda, + B, ldb, + HIPBLAS_VALUE(beta), C, ldc ); + + assert( rc == HIPBLAS_STATUS_SUCCESS ); + (void)rc; + return CHAMELEON_SUCCESS; +} diff --git a/gpuhipblas/include/gpuhipblas.h b/gpuhipblas/include/gpuhipblas.h index 572abcf9677fa3caf1e16f8c6a68a64f6e202cb9..705084958bc28c6c66f8d062f675d96cee90bb9d 100644 --- a/gpuhipblas/include/gpuhipblas.h +++ b/gpuhipblas/include/gpuhipblas.h @@ -11,12 +11,12 @@ * * @brief Chameleon GPU kernels main header * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge * @author Nathalie Furmento * @author Loris Lucido - * @date 2023-01-30 + * @date 2023-07-04 * @precisions normal z -> c d s * */ @@ -36,6 +36,7 @@ #include <hip/hip_runtime.h> #include <hip/hip_complex.h> +#include <hip/hip_fp16.h> #include <hipblas/hipblas.h> @@ -59,6 +60,15 @@ BEGIN_C_DECLS #include "gpuhipblas/gpuhipblas_c.h" #include "gpuhipblas/gpuhipblas_s.h" +int HIP_hgemm( cham_trans_t transa, cham_trans_t transb, + int m, int n, int k, + const CHAMELEON_Real16_t *alpha, + const CHAMELEON_Real16_t *A, int lda, + const CHAMELEON_Real16_t *B, int ldb, + const CHAMELEON_Real16_t *beta, + CHAMELEON_Real16_t *C, int ldc, + hipblasHandle_t handle ); + END_C_DECLS /** diff --git a/include/chameleon.h b/include/chameleon.h index 77b6544f0af5e0e599baf8575cd8755b5e6c5f27..b6ff6913c9f74c6960d4615bfeaf82ec878001c8 100644 --- a/include/chameleon.h +++ b/include/chameleon.h @@ -11,13 +11,13 @@ * * @brief Chameleon main header * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Cedric Castagnede * @author Florent Pruvost * @author Philippe Virouleau - * @date 2022-02-22 + * @date 2023-07-04 * */ #ifndef _chameleon_h_ @@ -120,7 +120,7 @@ int CHAMELEON_Lapack_to_Tile( void *Af77, int LDA, CHAM_desc_t *A ) __attribute_ int CHAMELEON_Tile_to_Lapack( CHAM_desc_t *A, void *Af77, int LDA ) __attribute__((deprecated("Please refer to CHAMELEON_Desc2Lap() instead"))); /* Descriptor */ -int CHAMELEON_Element_Size(int type); +int CHAMELEON_Element_Size( cham_flttype_t type ); int CHAMELEON_Desc_Create_User( CHAM_desc_t **desc, void *mat, cham_flttype_t dtyp, int mb, int nb, int bsiz, int lm, int ln, int i, int j, int m, int n, int p, int q, diff --git a/include/chameleon/constants.h b/include/chameleon/constants.h index cff1f56529ce56ce1ddf52e81bac66e8c5f219d2..fe50552f405fdef0e5e985d3ebcdcdd39e49b4d5 100644 --- a/include/chameleon/constants.h +++ b/include/chameleon/constants.h @@ -11,14 +11,14 @@ * * @brief Chameleon global constants * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Alycia Lisito * @author Terry Cojean - * @date 2022-02-22 + * @date 2023-07-04 * */ #ifndef _chameleon_constants_h_ @@ -36,15 +36,74 @@ /** * @brief Matrix floating point arithmetic */ +typedef enum chameleon_arithmetic_e { + Cham8 = 0, + ChamHalf = 1, + ChamSingle = 2, + ChamDouble = 3, +} cham_arithmetic_t; + +#define CHAM_ARITHMETIC_MASK 0b11 + +typedef enum chameleon_ftype_e { + ChamInt = 0, + ChamReal = 1, + ChamComplex = 2, +} cham_ftype_t; + +#define CHAM_FTYPE_MASK 0b1100 +#define CHAM_MIXED_MASK 0b10000 + +#define cham_get_arith( _ftype_ ) ( (_ftype_) & CHAM_ARITHMETIC_MASK ) +#define cham_get_ftype( _ftype_ ) (( (_ftype_) & CHAM_FTYPE_MASK ) >> 2 ) +#define cham_get_flttype( _ftype_ ) ( (_ftype_) & (CHAM_FTYPE_MASK | CHAM_ARITHMETIC_MASK) ) +#define cham_is_mixed( _ftype_ ) ( (_ftype_) & CHAM_MIXED_MASK ) + +#define cham_clean_mixed( _ftype_ ) ( (_ftype_) & ~CHAM_MIXED_MASK ) + +#define CHAMELEON_FLTTYPE( _ftype_, _arithmetic_ ) ( ((_ftype_) << 2) | (_arithmetic_) ) + typedef enum chameleon_flttype_e { - ChamByte = 0, - ChamInteger = 1, - ChamRealFloat = 2, - ChamRealDouble = 3, - ChamComplexFloat = 4, - ChamComplexDouble = 5, + ChamByte = CHAMELEON_FLTTYPE( ChamInt, Cham8 ), + ChamInteger16 = CHAMELEON_FLTTYPE( ChamInt, ChamHalf ), + ChamInteger = CHAMELEON_FLTTYPE( ChamInt, ChamSingle ), + ChamInteger32 = CHAMELEON_FLTTYPE( ChamInt, ChamSingle ), + ChamInteger64 = CHAMELEON_FLTTYPE( ChamInt, ChamDouble ), + ChamRealHalf = CHAMELEON_FLTTYPE( ChamReal, ChamHalf ), + ChamRealFloat = CHAMELEON_FLTTYPE( ChamReal, ChamSingle ), + ChamRealDouble = CHAMELEON_FLTTYPE( ChamReal, ChamDouble ), + ChamComplexHalf = CHAMELEON_FLTTYPE( ChamComplex, ChamHalf ), + ChamComplexFloat = CHAMELEON_FLTTYPE( ChamComplex, ChamSingle ), + ChamComplexDouble = CHAMELEON_FLTTYPE( ChamComplex, ChamDouble ), + ChamRealDoubleMixed = ChamRealDouble | CHAM_MIXED_MASK, + ChamComplexDoubleMixed = ChamComplexDouble | CHAM_MIXED_MASK, } cham_flttype_t; +#define ChamComplexSingle ChamComplexFloat +#define ChamRealSingle ChamRealFloat + +#define ChamConvert( in, out ) ( cham_clean_mixed(in) | (cham_clean_mixed(out) << 5) ) + +#define ChamConvertComplexDoubleToDouble ChamConvert( ChamComplexDouble, ChamComplexDouble ) +#define ChamConvertComplexDoubleToSingle ChamConvert( ChamComplexDouble, ChamComplexSingle ) +#define ChamConvertComplexDoubleToHalf ChamConvert( ChamComplexDouble, ChamComplexHalf ) +#define ChamConvertComplexSingleToDouble ChamConvert( ChamComplexSingle, ChamComplexDouble ) +#define ChamConvertComplexSingleToSingle ChamConvert( ChamComplexSingle, ChamComplexSingle ) +#define ChamConvertComplexSingleToHalf ChamConvert( ChamComplexSingle, ChamComplexHalf ) +#define ChamConvertComplexHalfToDouble ChamConvert( ChamComplexHalf, ChamComplexDouble ) +#define ChamConvertComplexHalfToSingle ChamConvert( ChamComplexHalf, ChamComplexSingle ) +#define ChamConvertComplexHalfToHalf ChamConvert( ChamComplexHalf, ChamComplexHalf ) + +#define ChamConvertRealDoubleToDouble ChamConvert( ChamRealDouble, ChamRealDouble ) +#define ChamConvertRealDoubleToSingle ChamConvert( ChamRealDouble, ChamRealSingle ) +#define ChamConvertRealDoubleToHalf ChamConvert( ChamRealDouble, ChamRealHalf ) +#define ChamConvertRealSingleToDouble ChamConvert( ChamRealSingle, ChamRealDouble ) +#define ChamConvertRealSingleToSingle ChamConvert( ChamRealSingle, ChamRealSingle ) +#define ChamConvertRealSingleToHalf ChamConvert( ChamRealSingle, ChamRealHalf ) +#define ChamConvertRealHalfToDouble ChamConvert( ChamRealHalf, ChamRealDouble ) +#define ChamConvertRealHalfToSingle ChamConvert( ChamRealHalf, ChamRealSingle ) +#define ChamConvertRealHalfToHalf ChamConvert( ChamRealHalf, ChamRealHalf ) + /** * @brief Matrix tile storage */ diff --git a/include/chameleon/runtime_struct.h b/include/chameleon/runtime_struct.h index 59a2ec3cca1b2b92cf8100bca84297f0806afa1d..4b172782507b5b43e17fb4f3672b6aebf13a7c9d 100644 --- a/include/chameleon/runtime_struct.h +++ b/include/chameleon/runtime_struct.h @@ -11,13 +11,13 @@ * * @brief Runtime structures * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Philippe Virouleau - * @date 2022-02-22 + * @date 2023-07-04 * */ #ifndef _chameleon_runtime_struct_h_ @@ -85,6 +85,7 @@ typedef struct runtime_option_s { int priority; /**< Define the submitted task priority */ int workerid; /**< Define the prefered worker id to perform the tasks */ int forcesub; /**< Force task submission if true */ + int withcuda; /**< Flag to know if cuda is enabled/disabled */ size_t ws_wsize; /**< Define the worker workspace size */ size_t ws_hsize; /**< Define the host workspace size for hybrid CPU/GPU kernel */ void *ws_worker; /**< Pointer to the worker workspace (structure) */ diff --git a/include/chameleon/types.h b/include/chameleon/types.h index a174db70eda2f8dca2b1e38e7aae4726a31a8fad..4d46a65fe4733414806d2589d6dc26a80a3b8e55 100644 --- a/include/chameleon/types.h +++ b/include/chameleon/types.h @@ -11,14 +11,14 @@ * * @brief Chameleon basic datatypes header * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost * @author Lucas Barros de Assis * @author Thomas Mijieux - * @date 2022-02-22 + * @date 2023-07-04 * */ #ifndef _chameleon_types_h_ @@ -104,6 +104,17 @@ typedef int8_t cham_bool_t; #endif #endif /* CHAMELEON_COMPLEX_CPP */ +/** + * Half precision on GPUs + */ +#if defined(__cplusplus) +typedef __half CHAMELEON_Real16_t; +#else +/* use short for cuda older than 7.5 and non-cuda files + * corresponding routines would not work anyway since there is no half precision */ +typedef short CHAMELEON_Real16_t; +#endif + /** * CHAMELEON Deprecated attribute */ diff --git a/runtime/starpu/control/runtime_options.c b/runtime/starpu/control/runtime_options.c index 6ade7b962cf17ec170c6e35465086b57583e7bdd..c70f1aa74517beb2edd41d30ebdafb52adc9a0d0 100644 --- a/runtime/starpu/control/runtime_options.c +++ b/runtime/starpu/control/runtime_options.c @@ -11,12 +11,12 @@ * * @brief Chameleon StarPU options routines * - * @version 1.2.0 + * @version 1.3.0 * @author Cedric Augonnet * @author Mathieu Faverge * @author Cedric Castagnede * @author Florent Pruvost - * @date 2022-02-22 + * @date 2023-07-04 * */ #include "chameleon_starpu.h" @@ -32,6 +32,7 @@ void RUNTIME_options_init( RUNTIME_option_t *options, CHAM_context_t *chamctxt, options->priority = RUNTIME_PRIORITY_MIN; options->workerid = (schedopt == NULL) ? -1 : schedopt->workerid; options->forcesub = 0; + options->withcuda = (chamctxt->ncudas > 0); options->ws_wsize = 0; options->ws_hsize = 0; options->ws_worker = NULL;