From 06432c12c7a7ccea3b15900a5bdc4b5d9831fe88 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Mon, 22 May 2023 11:18:18 -0400 Subject: [PATCH] starpu/codelets: Add dlag2h/hlag2d codelets --- gpucublas/include/gpucublas/gpucublas_z.h | 6 +- runtime/starpu/CMakeLists.txt | 5 +- runtime/starpu/codelets/codelet_dlag2h.c | 161 ++++++++++++++++++++ runtime/starpu/codelets/codelet_zcallback.c | 8 +- runtime/starpu/include/runtime_codelet_z.h | 5 + 5 files changed, 179 insertions(+), 6 deletions(-) create mode 100644 runtime/starpu/codelets/codelet_dlag2h.c diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h index 0773e0035..cf655f41f 100644 --- a/gpucublas/include/gpucublas/gpucublas_z.h +++ b/gpucublas/include/gpucublas/gpucublas_z.h @@ -11,10 +11,10 @@ * * @brief Chameleon GPU CHAMELEON_Complex64_t kernels header * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -24,6 +24,8 @@ /** * Declarations of cuda kernels - alphabetical order */ +int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle ); +int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle ); int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index f7c00783b..9711e7934 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -26,7 +26,7 @@ # @author Matthieu Kuhn # @author Loris Lucido # @author Terry Cojean -# @date 2023-01-30 +# @date 2023-07-06 # ### cmake_minimum_required(VERSION 3.1) @@ -242,6 +242,7 @@ set(RUNTIME_SRCS_GENERATED "") set(ZSRC codelets/codelet_zcallback.c codelets/codelet_zccallback.c + codelets/codelet_dlag2h.c ${CODELETS_ZSRC} ) diff --git a/runtime/starpu/codelets/codelet_dlag2h.c b/runtime/starpu/codelets/codelet_dlag2h.c new file mode 100644 index 000000000..b910559db --- /dev/null +++ b/runtime/starpu/codelets/codelet_dlag2h.c @@ -0,0 +1,161 @@ +/** + * + * @file starpu/codelet_dlag2h.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2h StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @author Lucas Barros de Assis + * @author Florent Pruvost + * @author Samuel Thibault + * @date 2023-07-06 + * @precisions normal d -> d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_d.h" + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_dlag2h_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_dlag2h( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_dlag2h failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( dlag2h, NULL, cl_dlag2h_cuda_func, STARPU_CUDA_ASYNC ) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +void INSERT_TASK_dlag2h( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_dlag2h; + void (*callback)(void*) = options->profiling ? cl_dlag2h_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamRealDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2h", +#endif + 0); +} + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_hlag2d_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_hlag2d( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_hlag2d failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( hlag2d, NULL, cl_hlag2d_cuda_func, STARPU_CUDA_ASYNC ) + +void INSERT_TASK_hlag2d( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_hlag2d; + void (*callback)(void*) = options->profiling ? cl_hlag2d_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamComplexFloat, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "hlag2d", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 8f05509f7..7d1bf87f6 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -11,12 +11,12 @@ * * @brief Chameleon zcallback StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -84,3 +84,7 @@ CHAMELEON_CL_CB(ztsmqr_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(ztstrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), M* M*M) CHAMELEON_CL_CB(zunmlq, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) CHAMELEON_CL_CB(zunmqr, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) +#if defined(PRECISION_d) || defined(PRECISION_s) +CHAMELEON_CL_CB(dlag2h, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(hlag2d, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +#endif diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 1147a2b15..03f2dee93 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -134,4 +134,9 @@ CODELETS_HEADER(zsytrf_nopiv); #endif CODELETS_HEADER(zplgsy); +#if defined(PRECISION_d) || defined(PRECISION_s) +CODELETS_HEADER(dlag2h); +CODELETS_HEADER(hlag2d); +#endif + #endif /* _runtime_codelet_z_h_ */ -- GitLab