diff --git a/gpucublas/include/CMakeLists.txt b/gpucublas/include/CMakeLists.txt index b208d377e5b85e2549153728182caa53c9ffa2ef..2899bcf14ddcefaeacb1fcf7a48afedad36851cb 100644 --- a/gpucublas/include/CMakeLists.txt +++ b/gpucublas/include/CMakeLists.txt @@ -17,10 +17,10 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Florent Pruvost # @author Mathieu Faverge -# @date 2022-02-22 +# @date 2023-07-06 # ### @@ -29,6 +29,7 @@ set(GPUCUBLAS_HDRS_GENERATED "") set(ZHDR gpucublas/gpucublas_z.h + gpucublas/gpucublas_zc.h ) precisions_rules_py( GPUCUBLAS_HDRS_GENERATED "${ZHDR}" diff --git a/gpucublas/include/gpucublas.h b/gpucublas/include/gpucublas.h index 2305b4f9be07ee8f06d2ce6154fd1d55c0ece938..e44a7e094278010192a979e2b831f57a5ee5a849 100644 --- a/gpucublas/include/gpucublas.h +++ b/gpucublas/include/gpucublas.h @@ -61,6 +61,8 @@ BEGIN_C_DECLS #include "gpucublas/gpucublas_d.h" #include "gpucublas/gpucublas_c.h" #include "gpucublas/gpucublas_s.h" +#include "gpucublas/gpucublas_zc.h" +#include "gpucublas/gpucublas_ds.h" int CUDA_hgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, diff --git a/gpucublas/include/gpucublas/gpucublas_zc.h b/gpucublas/include/gpucublas/gpucublas_zc.h new file mode 100644 index 0000000000000000000000000000000000000000..48f7073da82b211b806d9ccb83961900b7fb6535 --- /dev/null +++ b/gpucublas/include/gpucublas/gpucublas_zc.h @@ -0,0 +1,24 @@ +/** + * + * @file gpucublas_zc.h + * + * @copyright 2023-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon GPU Mixed-precision kernels header + * + * @version 1.3.0 + * @author Mathieu Faverge + * @date 2023-07-06 + * @precisions mixed zc -> zc ds + * + */ +#ifndef _gpucublas_zc_h_ +#define _gpucublas_zc_h_ + +int CUDA_clag2z( int m, int n, const cuFloatComplex *A, int lda, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); +int CUDA_zlag2c( int m, int n, const cuDoubleComplex *A, int lda, cuFloatComplex *B, int ldb, cublasHandle_t handle ); + +#endif /* _gpucublas_zc_h_ */ diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index b6ad117f48741309e6b1ab2e54ac2a827a155982..961cc9c0376c0fa48ce6bbf0fe2b4e183e3714ff 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -11,16 +11,14 @@ * * @brief Chameleon zlag2c StarPU codelet * - * @version 1.2.0 - * @comment This file has been automatically generated - * from Plasma 2.5.0 for CHAMELEON 0.9.2 + * @version 1.3.0 * @author Mathieu Faverge * @author Emmanuel Agullo * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost * @author Samuel Thibault - * @date 2022-02-22 + * @date 2023-07-06 * @precisions mixed zc -> ds * */ @@ -28,7 +26,8 @@ #include "runtime_codelet_zc.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) +static void +cl_zlag2c_cpu_func( void *descr[], void *cl_arg ) { int info = 0; int m; @@ -39,15 +38,44 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); + starpu_codelet_unpack_args( cl_arg, &m, &n ); TCORE_zlag2c( m, n, tileA, tileB, &info ); } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zlag2c_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_zlag2c( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_zlag2c failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(zlag2c, cl_zlag2c_cpu_func) +CODELETS( zlag2c, cl_zlag2c_cpu_func, cl_zlag2c_cuda_func, STARPU_CUDA_ASYNC ) /** * @@ -72,8 +100,8 @@ void INSERT_TASK_zlag2c(const RUNTIME_option_t *options, codelet, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex32_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexFloat, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, @@ -97,17 +125,45 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg) starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_clag2z( m, n, tileA, tileB); } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_clag2z_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_clag2z( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + if ( rc != 0 ) { + fprintf( stderr, "core_clag2z failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU(clag2z, cl_clag2z_cpu_func) +CODELETS( clag2z, cl_clag2z_cpu_func, cl_clag2z_cuda_func, STARPU_CUDA_ASYNC ) -void INSERT_TASK_clag2z(const RUNTIME_option_t *options, - int m, int n, int nb, - const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn) +void INSERT_TASK_clag2z( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) { (void)nb; struct starpu_codelet *codelet = &cl_clag2z; @@ -122,8 +178,8 @@ void INSERT_TASK_clag2z(const RUNTIME_option_t *options, codelet, STARPU_VALUE, &m, sizeof(int), STARPU_VALUE, &n, sizeof(int), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex32_t, Am, An), - STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + STARPU_R, RTBLKADDR(A, ChamComplexFloat, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), STARPU_PRIORITY, options->priority, STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid,