diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index bc972bcf54562388b6c091049c21d972b79cadf9..bd0823a6f425e600243e83e1e51794d70209f08b 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -22,7 +22,9 @@ * @author Gwenole Lucas * @author Philippe Swartvagher * @author Lucas Nesi - * @date 2022-02-22 + * @author Loris Lucido + * @author Terry Cojean + * @date 2023-01-30 * @precisions normal z -> c d s * */ @@ -61,7 +63,7 @@ cl_zgemm_cpu_func( void *descr[], void *cl_arg ) clargs->beta, tileC ); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_zgemm_cuda_func( void *descr[], void *cl_arg ) { @@ -90,12 +92,48 @@ cl_zgemm_cuda_func( void *descr[], void *cl_arg ) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_zgemm_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + HIP_zgemm( + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (hipDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); + + return; +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zgemm, cl_zgemm_cpu_func, cl_zgemm_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( zgemm, cl_zgemm_cpu_func, cl_zgemm_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_zgemm_Astat( const RUNTIME_option_t *options, cham_trans_t transA, cham_trans_t transB, diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index b7f70ca6868e5a5e8c1543c5ff1c51204be7ec60..cff474bbf6c75e4fdb1108895d6ee76e8a72f0e8 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -57,7 +57,7 @@ cl_zhemm_cpu_func( void *descr[], void *cl_arg ) clargs->beta, tileC ); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_zhemm_cuda_func( void *descr[], void *cl_arg ) { @@ -86,12 +86,46 @@ cl_zhemm_cuda_func( void *descr[], void *cl_arg ) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_zhemm_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + HIP_zhemm( + clargs->side, clargs->uplo, + clargs->m, clargs->n, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (hipDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zhemm, cl_zhemm_cpu_func, cl_zhemm_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, diff --git a/runtime/starpu/codelets/codelet_zher2k.c b/runtime/starpu/codelets/codelet_zher2k.c index 38babbe5a230611d595f4716c170f1a5618ad420..fa102852615bac660a9ea9f40367faa10cfc4f3a 100644 --- a/runtime/starpu/codelets/codelet_zher2k.c +++ b/runtime/starpu/codelets/codelet_zher2k.c @@ -48,7 +48,7 @@ static void cl_zher2k_cpu_func(void *descr[], void *cl_arg) n, k, alpha, tileA, tileB, beta, tileC); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) { cublasHandle_t handle = starpu_cublas_get_local_handle(); @@ -75,13 +75,46 @@ static void cl_zher2k_cuda_func(void *descr[], void *cl_arg) &beta, tileC->mat, tileC->ld, handle ); } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void cl_zher2k_hip_func(void *descr[], void *cl_arg) +{ + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + cham_uplo_t uplo; + cham_trans_t trans; + int n; + int k; + hipDoubleComplex alpha; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + double beta; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); + + HIP_zher2k( uplo, trans, + n, k, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC) +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zher2k, cl_zher2k_cpu_func, cl_zher2k_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( zher2k, cl_zher2k_cpu_func, cl_zher2k_cuda_func, STARPU_CUDA_ASYNC ) +#endif /** * diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 894f0b0269754ba9e0c0171efda6abdaa59b34b0..29301b83097c8b61383c00861777c35917ac27cd 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -74,12 +74,38 @@ cl_zherk_cuda_func(void *descr[], void *cl_arg) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_zherk_hip_func(void *descr[], void *cl_arg) +{ + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); + + HIP_zherk( + clargs->uplo, clargs->trans, clargs->n, clargs->k, + &(clargs->alpha), + tileA->mat, tileA->ld, + &(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zherk, cl_zherk_cpu_func, cl_zherk_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( zherk, cl_zherk_cpu_func, cl_zherk_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_zherk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, diff --git a/runtime/starpu/codelets/codelet_zsymm.c b/runtime/starpu/codelets/codelet_zsymm.c index 9fad20e13bdc75a10cde4bffb998343d71503fdb..54750f0b3f7de2f76083163b9a881db77d8066e3 100644 --- a/runtime/starpu/codelets/codelet_zsymm.c +++ b/runtime/starpu/codelets/codelet_zsymm.c @@ -58,7 +58,7 @@ cl_zsymm_cpu_func( void *descr[], void *cl_arg ) clargs->beta, tileC ); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_zsymm_cuda_func( void *descr[], void *cl_arg ) { @@ -87,12 +87,46 @@ cl_zsymm_cuda_func( void *descr[], void *cl_arg ) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_zsymm_hip_func( void *descr[], void *cl_arg ) +{ + struct cl_zsymm_args_s *clargs = (struct cl_zsymm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + + HIP_zsymm( + clargs->side, clargs->uplo, + clargs->m, clargs->n, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (hipDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zsymm, cl_zsymm_cpu_func, cl_zsymm_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( zsymm, cl_zsymm_cpu_func, cl_zsymm_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_zsymm_Astat( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, diff --git a/runtime/starpu/codelets/codelet_zsyr2k.c b/runtime/starpu/codelets/codelet_zsyr2k.c index af47c87edf546b3a4d3005081c4acff8d61161fe..7d5ce81aadff62dd3866bee1a4ee9ecc63b354a0 100644 --- a/runtime/starpu/codelets/codelet_zsyr2k.c +++ b/runtime/starpu/codelets/codelet_zsyr2k.c @@ -48,7 +48,7 @@ static void cl_zsyr2k_cpu_func(void *descr[], void *cl_arg) n, k, alpha, tileA, tileB, beta, tileC); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) { cublasHandle_t handle = starpu_cublas_get_local_handle(); @@ -75,13 +75,46 @@ static void cl_zsyr2k_cuda_func(void *descr[], void *cl_arg) &beta, tileC->mat, tileC->ld, handle ); } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void cl_zsyr2k_hip_func(void *descr[], void *cl_arg) +{ + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + cham_uplo_t uplo; + cham_trans_t trans; + int n; + int k; + hipDoubleComplex alpha; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + hipDoubleComplex beta; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + tileC = cti_interface_get(descr[2]); + + starpu_codelet_unpack_args(cl_arg, &uplo, &trans, &n, &k, &alpha, &beta); + + HIP_zsyr2k( uplo, trans, + n, k, + &alpha, tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + &beta, tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC) +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_hip_func, STARPU_HIP_ASYNC ) +#else +CODELETS( zsyr2k, cl_zsyr2k_cpu_func, cl_zsyr2k_cuda_func, STARPU_CUDA_ASYNC ) +#endif /** * diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 8d8dcdbe752e256a40ec8c914378dbc6356ba91f..d9b7e84fd00336b445a05eeac648957f61eb51e7 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -75,12 +75,38 @@ cl_zsyrk_cuda_func(void *descr[], void *cl_arg) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_zsyrk_hip_func(void *descr[], void *cl_arg) +{ + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileC; + + tileA = cti_interface_get(descr[0]); + tileC = cti_interface_get(descr[1]); + + HIP_zsyrk( + clargs->uplo, clargs->trans, clargs->n, clargs->k, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + (hipDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( zsyrk, cl_zsyrk_cpu_func, cl_zsyrk_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index e51b81ab7b9b45028b7f63aa00f1643be0a948c6..3bb14d38bef1119024293da7ecdfb6f58a67c12f 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -53,7 +53,7 @@ cl_ztrmm_cpu_func(void *descr[], void *cl_arg) clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { @@ -74,12 +74,38 @@ cl_ztrmm_cuda_func(void *descr[], void *cl_arg) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_ztrmm_hip_func(void *descr[], void *cl_arg) +{ + struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + HIP_ztrmm( + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( ztrmm, cl_ztrmm_cpu_func, cl_ztrmm_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag, diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 0196649684a4652ff5686bc060aad7e7879a1f2b..19e83c01a0010bcd1d495932ab7c67e1ac3dc139 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -20,7 +20,9 @@ * @author Lucas Barros de Assis * @author Florent Pruvost * @author Gwenole Lucas - * @date 2022-02-22 + * @author Loris Lucido + * @author Terry Cojean + * @date 2023-01-30 * @precisions normal z -> c d s * */ @@ -54,7 +56,7 @@ cl_ztrsm_cpu_func(void *descr[], void *cl_arg) clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } -#ifdef CHAMELEON_USE_CUDA +#if defined(CHAMELEON_USE_CUDA) static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { @@ -75,12 +77,40 @@ cl_ztrsm_cuda_func(void *descr[], void *cl_arg) handle ); } #endif /* defined(CHAMELEON_USE_CUDA) */ + +#if defined(CHAMELEON_USE_HIP) +static void +cl_ztrsm_hip_func(void *descr[], void *cl_arg) +{ + struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; + hipblasHandle_t handle = starpu_hipblas_get_local_handle(); + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + HIP_ztrsm( + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (hipDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + handle ); +} +#endif /* defined(CHAMELEON_USE_HIP) */ + #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ + +#if defined(CHAMELEON_USE_HIP) +CODELETS_GPU( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_hip_func, STARPU_HIP_ASYNC ) +#else CODELETS( ztrsm, cl_ztrsm_cpu_func, cl_ztrsm_cuda_func, STARPU_CUDA_ASYNC ) +#endif void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, cham_side_t side, cham_uplo_t uplo, cham_trans_t transA, cham_diag_t diag,