diff --git a/runtime/starpu/codelets/codelet_zgeadd.c b/runtime/starpu/codelets/codelet_zgeadd.c index 825a8fb6c2c59343c0078cc6820149f0e8c27b2b..8c652200e8ba0f3eb2e752a257a6917ac8b989b0 100644 --- a/runtime/starpu/codelets/codelet_zgeadd.c +++ b/runtime/starpu/codelets/codelet_zgeadd.c @@ -26,39 +26,42 @@ #include "runtime_codelet_z.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zgeadd_cpu_func(void *descr[], void *cl_arg) +static void +cl_zgeadd_cpu_func( void *descr[], void *cl_arg ) { - cham_trans_t trans; - int M; - int N; + cham_trans_t trans; + int M; + int N; CHAMELEON_Complex64_t alpha; - CHAM_tile_t *tileA; + CHAM_tile_t *tileA; CHAMELEON_Complex64_t beta; - CHAM_tile_t *tileB; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta); - TCORE_zgeadd(trans, M, N, alpha, tileA, beta, tileB); + starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); + TCORE_zgeadd( trans, M, N, alpha, tileA, beta, tileB ); + return; } -#ifdef CHAMELEON_USE_CUBLAS -static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgeadd_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_trans_t trans; - int M; - int N; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + cham_trans_t trans; + int M; + int N; cuDoubleComplex alpha; - CHAM_tile_t *tileA; + CHAM_tile_t *tileA; cuDoubleComplex beta; - CHAM_tile_t *tileB; + CHAM_tile_t *tileB; - tileA = cti_interface_get(descr[0]); - tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &trans, &M, &N, &alpha, &beta ); + tileA = cti_interface_get( descr[0] ); + tileB = cti_interface_get( descr[1] ); + starpu_codelet_unpack_args( cl_arg, &trans, &M, &N, &alpha, &beta ); CUDA_zgeadd( trans, M, N, &alpha, tileA->mat, tileA->ld, @@ -67,17 +70,13 @@ static void cl_zgeadd_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* defined(CHAMELEON_USE_CUBLAS) */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUBLAS) -CODELETS(zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(zgeadd, cl_zgeadd_cpu_func) -#endif +CODELETS( zgeadd, cl_zgeadd_cpu_func, cl_zgeadd_cuda_func, STARPU_CUDA_ASYNC ); void INSERT_TASK_zgeadd( const RUNTIME_option_t *options, cham_trans_t trans, int m, int n, int nb, diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c index 26327c982f997473de0e17885a8381a258ecaeb6..b3cfebbd93d958e206b56907cb42bb3ebaec6df5 100644 --- a/runtime/starpu/codelets/codelet_zgersum.c +++ b/runtime/starpu/codelets/codelet_zgersum.c @@ -22,7 +22,8 @@ #include "runtime_codelet_z.h" #if !defined(CHAMELEON_SIMULATION) -static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg) +static void +cl_zgersum_redux_cpu_func( void *descr[], void *cl_arg ) { CHAM_tile_t *tileA; CHAM_tile_t *tileB; @@ -38,13 +39,14 @@ static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg) return; } -#ifdef CHAMELEON_USE_CUBLAS -static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg) +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgersum_redux_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - CHAMELEON_Complex64_t zone = 1.; - CHAM_tile_t *tileA; - CHAM_tile_t *tileB; + cublasHandle_t handle = starpu_cublas_get_local_handle(); + CHAMELEON_Complex64_t zone = 1.; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); @@ -59,17 +61,13 @@ static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg) return; } -#endif /* defined(CHAMELEON_USE_CUBLAS) */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -#if defined(CHAMELEON_USE_CUBLAS) -CODELETS(zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC) -#else -CODELETS_CPU(zgersum_redux, cl_zgersum_redux_cpu_func) -#endif +CODELETS( zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC ); #if !defined(CHAMELEON_SIMULATION) static void @@ -83,12 +81,29 @@ cl_zgersum_init_cpu_func( void *descr[], void *cl_arg ) (void)cl_arg; } + +#if defined(CHAMELEON_USE_CUDA) +static void +cl_zgersum_init_cuda_func( void *descr[], void *cl_arg ) +{ + CHAM_tile_t *tileA; + cublasStatus_t rc; + + tileA = cti_interface_get(descr[0]); + + rc = cudaMemset2D( tileA->mat, tileA->ld * sizeof(CHAMELEON_Complex64_t), 0, + tileA->m * sizeof(CHAMELEON_Complex64_t), tileA->n ); + assert( rc == CUBLAS_STATUS_SUCCESS ); + + (void)cl_arg; +} +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU( zgersum_init, cl_zgersum_init_cpu_func ); +CODELETS( zgersum_init, cl_zgersum_init_cpu_func, cl_zgersum_init_cuda_func, STARPU_CUDA_ASYNC ); void RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An ) @@ -108,7 +123,7 @@ RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options, starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), options->priority + 1, - 2 /* Arbre binaire */ ); + 2 /* Binary tree */ ); #else (void)options; (void)A;