diff --git a/runtime/starpu/codelets/codelet_zgersum.c b/runtime/starpu/codelets/codelet_zgersum.c
index 26327c982f997473de0e17885a8381a258ecaeb6..b3cfebbd93d958e206b56907cb42bb3ebaec6df5 100644
--- a/runtime/starpu/codelets/codelet_zgersum.c
+++ b/runtime/starpu/codelets/codelet_zgersum.c
@@ -22,7 +22,8 @@
 #include "runtime_codelet_z.h"
 
 #if !defined(CHAMELEON_SIMULATION)
-static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg)
+static void
+cl_zgersum_redux_cpu_func( void *descr[], void *cl_arg )
 {
     CHAM_tile_t *tileA;
     CHAM_tile_t *tileB;
@@ -38,13 +39,14 @@ static void cl_zgersum_redux_cpu_func(void *descr[], void *cl_arg)
     return;
 }
 
-#ifdef CHAMELEON_USE_CUBLAS
-static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg)
+#if defined(CHAMELEON_USE_CUDA)
+static void
+cl_zgersum_redux_cuda_func( void *descr[], void *cl_arg )
 {
-    cublasHandle_t handle = starpu_cublas_get_local_handle();
-    CHAMELEON_Complex64_t zone = 1.;
-    CHAM_tile_t *tileA;
-    CHAM_tile_t *tileB;
+    cublasHandle_t        handle = starpu_cublas_get_local_handle();
+    CHAMELEON_Complex64_t zone   = 1.;
+    CHAM_tile_t          *tileA;
+    CHAM_tile_t          *tileB;
 
     tileA = cti_interface_get(descr[0]);
     tileB = cti_interface_get(descr[1]);
@@ -59,17 +61,13 @@ static void cl_zgersum_redux_cuda_func(void *descr[], void *cl_arg)
 
     return;
 }
-#endif /* defined(CHAMELEON_USE_CUBLAS) */
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-#if defined(CHAMELEON_USE_CUBLAS)
-CODELETS(zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC)
-#else
-CODELETS_CPU(zgersum_redux, cl_zgersum_redux_cpu_func)
-#endif
+CODELETS( zgersum_redux, cl_zgersum_redux_cpu_func, cl_zgersum_redux_cuda_func, STARPU_CUDA_ASYNC );
 
 #if !defined(CHAMELEON_SIMULATION)
 static void
@@ -83,12 +81,38 @@ cl_zgersum_init_cpu_func( void *descr[], void *cl_arg )
 
     (void)cl_arg;
 }
+
+#if defined(CHAMELEON_USE_CUDA)
+/*
+ * CUDA implementation of the zgersum_init codelet: sets to zero the tile held
+ * in descr[0] (rows of m elements, n rows, pitch of ld elements). The memset
+ * is queued on StarPU's local CUDA stream and not waited for here, because the
+ * codelet is registered with STARPU_CUDA_ASYNC.
+ */
+static void
+cl_zgersum_init_cuda_func( void *descr[], void *cl_arg )
+{
+    cudaStream_t stream = starpu_cuda_get_local_stream();
+    CHAM_tile_t *tileA;
+    cudaError_t  rc;
+
+    tileA = cti_interface_get(descr[0]);
+
+    rc = cudaMemset2DAsync( tileA->mat, tileA->ld * sizeof(CHAMELEON_Complex64_t), 0,
+                            tileA->m * sizeof(CHAMELEON_Complex64_t), tileA->n,
+                            stream );
+    assert( rc == cudaSuccess );
+
+    (void)rc; /* rc is otherwise unused when assert() is compiled out */
+    (void)cl_arg;
+}
+#endif /* defined(CHAMELEON_USE_CUDA) */
 #endif /* !defined(CHAMELEON_SIMULATION) */
 
 /*
  * Codelet definition
  */
-CODELETS_CPU( zgersum_init, cl_zgersum_init_cpu_func );
+CODELETS( zgersum_init, cl_zgersum_init_cpu_func, cl_zgersum_init_cuda_func, STARPU_CUDA_ASYNC );
 
 void
 RUNTIME_zgersum_set_methods( const CHAM_desc_t *A, int Am, int An )
@@ -108,7 +132,7 @@ RUNTIME_zgersum_submit_tree( const RUNTIME_option_t *options,
     starpu_mpi_redux_data_prio_tree( MPI_COMM_WORLD,
                                      RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An),
                                      options->priority + 1,
-                                     2 /* Arbre binaire */ );
+                                     2 /* Binary tree */ );
 #else
     (void)options;
     (void)A;