diff --git a/gpucublas/include/gpucublas/gpucublas_z.h b/gpucublas/include/gpucublas/gpucublas_z.h index 0773e003598a9a72597355dc71d5f785ec48af70..cf655f41fa47511b06a4c60d11e6397a8b6d9514 100644 --- a/gpucublas/include/gpucublas/gpucublas_z.h +++ b/gpucublas/include/gpucublas/gpucublas_z.h @@ -11,10 +11,10 @@ * * @brief Chameleon GPU CHAMELEON_Complex64_t kernels header * - * @version 1.2.0 + * @version 1.3.0 * @author Florent Pruvost * @author Mathieu Faverge - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -24,6 +24,8 @@ /** * Declarations of cuda kernels - alphabetical order */ +int CUDA_dlag2h( int m, int n, const double *A, int lda, CHAMELEON_Real16_t *B, int ldb, cublasHandle_t handle ); +int CUDA_hlag2d( int m, int n, const CHAMELEON_Real16_t *A, int lda, double *B, int ldb, cublasHandle_t handle ); int CUDA_zgeadd( cham_trans_t trans, int m, int n, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *beta, cuDoubleComplex *B, int ldb, cublasHandle_t handle ); int CUDA_zgemerge( cham_side_t side, cham_diag_t diag, int M, int N, const cuDoubleComplex *A, int LDA, cuDoubleComplex *B, int LDB, cublasHandle_t handle ); int CUDA_zgemm( cham_trans_t transa, cham_trans_t transb, int m, int n, int k, const cuDoubleComplex *alpha, const cuDoubleComplex *A, int lda, const cuDoubleComplex *B, int ldb, const cuDoubleComplex *beta, cuDoubleComplex *C, int ldc, cublasHandle_t handle ); diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index f7c00783b3e6ec08c10edf41ef9530857810c4de..9711e7934f4292929b09493b0c9b373878792264 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -17,7 +17,7 @@ # Univ. of California Berkeley, # Univ. of Colorado Denver. # -# @version 1.2.0 +# @version 1.3.0 # @author Cedric Castagnede # @author Emmanuel Agullo # @author Mathieu Faverge @@ -26,7 +26,7 @@ # @author Matthieu Kuhn # @author Loris Lucido # @author Terry Cojean -# @date 2023-01-30 +# @date 2023-07-06 # ### cmake_minimum_required(VERSION 3.1) @@ -242,6 +242,7 @@ set(RUNTIME_SRCS_GENERATED "") set(ZSRC codelets/codelet_zcallback.c codelets/codelet_zccallback.c + codelets/codelet_dlag2h.c ${CODELETS_ZSRC} ) diff --git a/runtime/starpu/codelets/codelet_dlag2h.c b/runtime/starpu/codelets/codelet_dlag2h.c new file mode 100644 index 0000000000000000000000000000000000000000..b910559db38c20a147e23173c884ea5364d923e1 --- /dev/null +++ b/runtime/starpu/codelets/codelet_dlag2h.c @@ -0,0 +1,161 @@ +/** + * + * @file starpu/codelet_dlag2h.c + * + * @copyright 2009-2014 The University of Tennessee and The University of + * Tennessee Research Foundation. All rights reserved. + * @copyright 2012-2023 Bordeaux INP, CNRS (LaBRI UMR 5800), Inria, + * Univ. Bordeaux. All rights reserved. + * + *** + * + * @brief Chameleon dlag2h StarPU codelet + * + * @version 1.3.0 + * @author Mathieu Faverge + * @author Emmanuel Agullo + * @author Cedric Castagnede + * @author Lucas Barros de Assis + * @author Florent Pruvost + * @author Samuel Thibault + * @date 2023-07-06 + * @precisions normal d -> d s + * + */ +#include "chameleon_starpu.h" +#include "runtime_codelet_d.h" + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_dlag2h_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_dlag2h( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_dlag2h failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( dlag2h, NULL, cl_dlag2h_cuda_func, STARPU_CUDA_ASYNC ) + +/** + * + * @ingroup INSERT_TASK_Complex64_t + * + */ +void INSERT_TASK_dlag2h( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_dlag2h; + void (*callback)(void*) = options->profiling ? cl_dlag2h_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamRealDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamRealHalf, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "dlag2h", +#endif + 0); +} + +#if !defined(CHAMELEON_SIMULATION) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_hlag2d_cuda_func( void *descr[], void *cl_arg ) +{ + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + int m, n; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + + starpu_codelet_unpack_args( cl_arg, &m, &n ); + + int rc = CUDA_hlag2d( + m, n, + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); + + if ( rc != 0 ) { + fprintf( stderr, "core_hlag2d failed with info(%d)\n", rc ); + } +} +#endif /* defined(CHAMELEON_USE_CUDA) */ +#endif /* !defined(CHAMELEON_SIMULATION) */ + +/* + * Codelet definition + */ +CODELETS( hlag2d, NULL, cl_hlag2d_cuda_func, STARPU_CUDA_ASYNC ) + +void INSERT_TASK_hlag2d( const RUNTIME_option_t *options, + int m, int n, int nb, + const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn ) +{ + (void)nb; + struct starpu_codelet *codelet = &cl_hlag2d; + void (*callback)(void*) = options->profiling ? cl_hlag2d_callback : NULL; + + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); + CHAMELEON_END_ACCESS_DECLARATION; + + rt_starpu_insert_task( + codelet, + STARPU_VALUE, &m, sizeof(int), + STARPU_VALUE, &n, sizeof(int), + STARPU_R, RTBLKADDR(A, ChamComplexFloat, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, "hlag2d", +#endif + 0); +} diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 8f05509f70faf0d971c15ecb42aa064738611631..7d1bf87f6c669c18c06d72727d45713be1b86189 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -11,12 +11,12 @@ * * @brief Chameleon zcallback StarPU codelet * - * @version 1.2.0 + * @version 1.3.0 * @author Mathieu Faverge * @author Cedric Augonnet * @author Florent Pruvost * @author Alycia Lisito - * @date 2022-02-22 + * @date 2023-07-06 * @precisions normal z -> c d s * */ @@ -84,3 +84,7 @@ CHAMELEON_CL_CB(ztsmqr_hetra1, cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(ztstrf, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), M* M*M) CHAMELEON_CL_CB(zunmlq, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) CHAMELEON_CL_CB(zunmqr, cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), cti_handle_get_m(task->handles[0]), 2. *M* M*M) +#if defined(PRECISION_d) || defined(PRECISION_s) +CHAMELEON_CL_CB(dlag2h, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(hlag2d, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +#endif diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 1147a2b15a602e9907333c8acb26fc53def8daf7..03f2dee935938ebf0202b09bcfe53442950f70e6 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -134,4 +134,9 @@ CODELETS_HEADER(zsytrf_nopiv); #endif CODELETS_HEADER(zplgsy); +#if defined(PRECISION_d) || defined(PRECISION_s) +CODELETS_HEADER(dlag2h); +CODELETS_HEADER(hlag2d); +#endif + #endif /* _runtime_codelet_z_h_ */