diff --git a/include/chameleon/tasks_z.h b/include/chameleon/tasks_z.h index 0403d1c8d6b898ba41eb6c57bb423b3dc65c28fa..4576ba9e44671fa4964b988d9f9a73e6bb9df312 100644 --- a/include/chameleon/tasks_z.h +++ b/include/chameleon/tasks_z.h @@ -110,6 +110,12 @@ void INSERT_TASK_zhemm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn, CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); +void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ); void INSERT_TASK_zher2k( const RUNTIME_option_t *options, cham_uplo_t uplo, cham_trans_t trans, int n, int k, int nb, diff --git a/runtime/openmp/codelets/codelet_zhemm.c b/runtime/openmp/codelets/codelet_zhemm.c index 79d36ff56f9c76074225a9b0438631236eab3b39..2d36c052c02632b2363f2ba6078e46839e4b84ea 100644 --- a/runtime/openmp/codelets/codelet_zhemm.c +++ b/runtime/openmp/codelets/codelet_zhemm.c @@ -20,23 +20,37 @@ #include "chameleon/tasks_z.h" #include "coreblas/coreblas_ztile.h" -void INSERT_TASK_zhemm( const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); CHAM_tile_t *tileC = C->get_blktile( C, Cm, Cn ); #pragma omp task firstprivate( side, uplo, m, n, alpha, tileA, tileB, beta, tileC ) depend( in:tileA[0], tileB[0] ) depend( inout:tileC[0] ) TCORE_zhemm( side, uplo, - m, n, - alpha, tileA, - tileB, - beta, tileC ); + m, n, + alpha, tileA, + tileB, + beta, tileC ); (void)options; (void)nb; } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/parsec/codelets/codelet_zhemm.c b/runtime/parsec/codelets/codelet_zhemm.c index a1c398308c514eaaea41be192c061bc42b09ac09..62ecccc26161921e93e832c6773a83284f6d76dc 100644 --- a/runtime/parsec/codelets/codelet_zhemm.c +++ b/runtime/parsec/codelets/codelet_zhemm.c @@ -43,20 +43,21 @@ CORE_zhemm_parsec( parsec_execution_stream_t *context, this_task, &side, &uplo, &M, &N, &alpha, &A, &LDA, &B, &LDB, &beta, &C, &LDC ); CORE_zhemm( side, uplo, M, N, - alpha, A, LDA, - B, LDB, - beta, C, LDC); + alpha, A, LDA, + B, LDB, + beta, C, LDC ); (void)context; return PARSEC_HOOK_RETURN_DONE; } -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { parsec_taskpool_t* PARSEC_dtd_taskpool = (parsec_taskpool_t *)(options->sequence->schedopt); CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); @@ -81,3 +82,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, (void)nb; } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/quark/codelets/codelet_zhemm.c b/runtime/quark/codelets/codelet_zhemm.c index 9ab119755135f795ebc39031bd08795f5a59b07b..917376cedfdea194ae2c789808e78a92dad1f610 100644 --- a/runtime/quark/codelets/codelet_zhemm.c +++ b/runtime/quark/codelets/codelet_zhemm.c @@ -30,28 +30,28 @@ void CORE_zhemm_quark(Quark *quark) { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; - quark_unpack_args_9(quark, side, uplo, M, N, alpha, tileA, tileB, beta, tileC); - TCORE_zhemm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + quark_unpack_args_9(quark, side, uplo, m, n, alpha, tileA, tileB, beta, tileC); + TCORE_zhemm( side, uplo, + m, n, + alpha, tileA, tileB, + beta, tileC ); } -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void +INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, @@ -74,3 +74,16 @@ void INSERT_TASK_zhemm(const RUNTIME_option_t *options, sizeof(void*), RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), accessC, 0); } + +void +INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + INSERT_TASK_zhemm( options, side, uplo, m, n, nb, + alpha, A, Am, An, B, Bm, Bn, + beta, C, Cm, Cn ); +} diff --git a/runtime/starpu/codelets/codelet_zhemm.c b/runtime/starpu/codelets/codelet_zhemm.c index c05dbb92d53962d337b995f3725f02aeae19e31f..4451431f4c075feca2d6d6f6170098be3cf1d3cc 100644 --- a/runtime/starpu/codelets/codelet_zhemm.c +++ b/runtime/starpu/codelets/codelet_zhemm.c @@ -26,111 +26,229 @@ #include "chameleon_starpu.h" #include "runtime_codelet_z.h" -#if !defined(CHAMELEON_SIMULATION) -static void cl_zhemm_cpu_func(void *descr[], void *cl_arg) -{ +struct cl_zhemm_args_s { cham_side_t side; cham_uplo_t uplo; - int M; - int N; + int m; + int n; CHAMELEON_Complex64_t alpha; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAMELEON_Complex64_t beta; CHAM_tile_t *tileC; +}; + +#if !defined(CHAMELEON_SIMULATION) +static void +cl_zhemm_cpu_func( void *descr[], void *cl_arg ) +{ + struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); - TCORE_zhemm(side, uplo, - M, N, - alpha, tileA, - tileB, - beta, tileC); + TCORE_zhemm( clargs->side, clargs->uplo, + clargs->m, clargs->n, + clargs->alpha, tileA, tileB, + clargs->beta, tileC ); } #ifdef CHAMELEON_USE_CUDA -static void cl_zhemm_cuda_func(void *descr[], void *cl_arg) +static void +cl_zhemm_cuda_func( void *descr[], void *cl_arg ) { - cublasHandle_t handle = starpu_cublas_get_local_handle(); - cham_side_t side; - cham_uplo_t uplo; - int M; - int N; - cuDoubleComplex alpha; + struct cl_zhemm_args_s *clargs = (struct cl_zhemm_args_s *)cl_arg; + cublasHandle_t handle = starpu_cublas_get_local_handle(); CHAM_tile_t *tileA; CHAM_tile_t *tileB; - cuDoubleComplex beta; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &side, &uplo, &M, &N, &alpha, &beta); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); CUDA_zhemm( - side, uplo, - M, N, - &alpha, tileA->mat, tileA->ld, - tileB->mat, tileB->ld, - &beta, tileC->mat, tileC->ld, + clargs->side, clargs->uplo, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), + tileA->mat, tileA->ld, + tileB->mat, tileB->ld, + (cuDoubleComplex*)&(clargs->beta), + tileC->mat, tileC->ld, handle ); } -#endif /* CHAMELEON_USE_CUDA */ +#endif /* defined(CHAMELEON_USE_CUDA) */ #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS(zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC) +CODELETS( zhemm, cl_zhemm_cpu_func, cl_zhemm_cuda_func, STARPU_CUDA_ASYNC ) -/** - * - * @ingroup INSERT_TASK_Complex64_t - * - */ -void INSERT_TASK_zhemm(const RUNTIME_option_t *options, - cham_side_t side, cham_uplo_t uplo, - int m, int n, int nb, - CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, - const CHAM_desc_t *B, int Bm, int Bn, - CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn) +void INSERT_TASK_zhemm_Astat( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) +{ + if ( alpha == 0. ) { + return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, + beta, C, Cm, Cn ); + } + + struct cl_zhemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zhemm_Astat"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + /* Check A as write, since it will be the owner of the computation */ + CHAMELEON_ACCESS_W(A, Am, An); + CHAMELEON_ACCESS_R(B, Bm, Bn); + /* Check C as read, since it will be used in a reduction */ + CHAMELEON_ACCESS_R(C, Cm, Cn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_zhemm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zhemm_callback : NULL; + + /* Reduce the C access if needed */ + if ( beta == 0. ) { + accessC = STARPU_W; + } +#if defined(HAVE_STARPU_MPI_REDUX) + else if ( beta == 1. ) { + accessC = STARPU_MPI_REDUX; + } +#endif + else { + accessC = STARPU_RW; + } + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ + rt_starpu_insert_task( + &cl_zhemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_NODE, A->get_rankof(A, Am, An), +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + 0 ); +} + +void INSERT_TASK_zhemm( const RUNTIME_option_t *options, + cham_side_t side, cham_uplo_t uplo, + int m, int n, int nb, + CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, + const CHAM_desc_t *B, int Bm, int Bn, + CHAMELEON_Complex64_t beta, const CHAM_desc_t *C, int Cm, int Cn ) { if ( alpha == 0. ) { return INSERT_TASK_zlascal( options, ChamUpperLower, m, n, nb, beta, C, Cm, Cn ); } - (void)nb; - struct starpu_codelet *codelet = &cl_zhemm; - void (*callback)(void*) = options->profiling ? cl_zhemm_callback : NULL; - int accessC = ( beta == 0. ) ? STARPU_W : STARPU_RW; + struct cl_zhemm_args_s *clargs = NULL; + void (*callback)(void*); + int accessC; + int exec = 0; + char *cl_name = "zhemm"; + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zhemm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + + /* Callback for profiling information */ + callback = options->profiling ? cl_zhemm_callback : NULL; + + /* Reduce the C access if needed */ + accessC = ( beta == 0. ) ? STARPU_W : (STARPU_RW | ((beta == 1.) ? STARPU_COMMUTE : 0)); + +#if defined(CHAMELEON_KERNELS_TRACE) + { + char *cl_fullname; + chameleon_asprintf( &cl_fullname, "%s( %s, %s, %s )", cl_name, clargs->tileA->name, clargs->tileB->name, clargs->tileC->name ); + cl_name = cl_fullname; + } +#endif + + /* Insert the task */ rt_starpu_insert_task( - codelet, - STARPU_VALUE, &side, sizeof(int), - STARPU_VALUE, &uplo, sizeof(int), - STARPU_VALUE, &m, sizeof(int), - STARPU_VALUE, &n, sizeof(int), - STARPU_VALUE, &alpha, sizeof(CHAMELEON_Complex64_t), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - STARPU_VALUE, &beta, sizeof(CHAMELEON_Complex64_t), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, + &cl_zhemm, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zhemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, "zhemm", + STARPU_NAME, cl_name, #endif - 0); + 0 ); }