From 967a52a4bc8aded0778244cf999fc0df62113622 Mon Sep 17 00:00:00 2001 From: Antoine JEGO <antoine.jego@etu.enseeiht.fr> Date: Tue, 13 Sep 2022 15:57:54 +0200 Subject: [PATCH 1/2] gersum: Fix the reduction init codelet on GPU --- runtime/starpu/codelets/codelet_zgersum.c | 45 +++++++++++++++-------- 1 file changed, 30 insertions(+), 15 deletions(-) diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c index 26327c982..b3cfebbd9 100644 --- a/runtime/starpu/codelets/codelet_zgersum.c +++ b/runtime/starpu/codelets/codelet_zgersum.c @@ -22,7 +22,8 @@ #include "runtime_codelet_z.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg) +static void +cl_zgersum_redux_cpu_func( void *descr[], void *cl_arg ) { CHAM_tile_t *tileA; CHAM_tile_t *tileB; @@ -38,13 +39,14 @@ static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg) return; } -#ifdef CHAMELEON_USE_CUBLAS -static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgersum_redux_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - CHAMELEON_Complex64_t zone = 1.; - CHAM_tile_t *tileA; - CHAM_tile_t *tileB; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAMELEON_Complex64_t zone = 1.; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); @@ -59,17 +61,13 @@ static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* defined(CHAMELEON_USE_CUBLAS) */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUBLAS) -CODELETS(zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(zgersum_redux, cl_zgersum_redux_cpu_func) -#endif +CODELETS( zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC ); #if !defined(CHAMELEON_SIMULATION) static void @@ -83,12 +81,29 @@ cl_zgersum_init_cpu_func( void *descr[], void *cl_arg ) (void)cl_arg; } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgersum_init_cuda_func( void *descr[], void *cl_arg ) +{ + CHAM_tile_t *tileA; + cublasStatus_t rc; + + tileA = cti_interface_get(descr[0]); + + rc = cudaMemset2D( tileA->mat, tileA->ld * sizeof(CHAMELEON_Complex64_t), 0, + tileA->m * sizeof(CHAMELEON_Complex64_t), tileA->n ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + + (void)cl_arg; +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU( zgersum_init, cl_zgersum_init_cpu_func ); +CODELETS( zgersum_init, cl_zgersum_init_cpu_func, cl_zgersum_init_cuda_func, STARPU_CUDA_ASYNC ); void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) @@ -108,7 +123,7 @@ RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), options->priority + 1, - 2 /* Arbre binaire */ ); + 2 /* Binary tree */ ); #else (void)options; (void)A; -- GitLab From 034f763480e58b0841c3a246ddbc843d53b79ca9 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Wed, 21 Sep 2022 23:03:30 +0200 Subject: [PATCH 2/2] coedelet/geadd: coding style --- runtime/starpu/codelets/codelet_zgeadd.c | 49 ++++++++++++------------ 1 file changed, 24 insertions(+), 25 deletions(-) diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index 825a8fb6c..8c652200e 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -26,39 +26,42 @@ #include "runtime_codelet_z.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) +static void +cl_zgeadd_cpu_func( void *descr[], void *cl_arg ) { - cham_trans_t trans; - int M; - int N; + cham_trans_t trans; + int M; + int N; CHAMELEON_Complex64_t alpha; - CHAM_tile_t *tileA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAM_tile_t *tileB; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta); - TCORE_zgeadd(trans, M, N, alpha, tileA, beta, tileB); + starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); + TCORE_zgeadd( trans, M, N, alpha, tileA, beta, tileB ); + return; } -#ifdef CHAMELEON_USE_CUBLAS -static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgeadd_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_trans_t trans; - int M; - int N; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + cham_trans_t trans; + int M; + int N; cuDoubleComplex alpha; - CHAM_tile_t *tileA; + CHAM_tile_t *tileA; cuDoubleComplex beta; - CHAM_tile_t *tileB; + CHAM_tile_t *tileB; - tileA = cti_interface_get(descr[0]); - tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta ); + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); CUDA_zgeadd( trans, M, N, &alpha, tileA->mat, tileA->ld, @@ -67,17 +70,13 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* defined(CHAMELEON_USE_CUBLAS) */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUBLAS) -CODELETS(zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(zgeadd, cl_zgeadd_cpu_func) -#endif +CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC ); void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, -- GitLab