diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c515d7e5c5dfa0d0b0e4980a3336eb2ecdbb32b..b064cf15135af1b366763e1e8a78e9b903f02047 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -366,15 +366,6 @@ if(NOT CHAMELEON_SIMULATION) endif() if (CUDA_LIBRARIES) set_target_properties(CUDA::CUDA PROPERTIES INTERFACE_LINK_LIBRARIES "${CUDA_LIBRARIES}") - set(CMAKE_REQUIRED_INCLUDES "${CUDA_INCLUDE_DIRS}") - set(CMAKE_REQUIRED_LIBRARIES "${CUDA_LIBRARIES}") - if(CUDA_VERSION VERSION_LESS "4.0") - set(CUDA_HAVE_PEER_DEVICE_MEMORY_ACCESS 0) - else() - check_function_exists(cuDeviceCanAccessPeer CUDA_HAVE_PEER_DEVICE_MEMORY_ACCESS) - endif() - unset(CMAKE_REQUIRED_INCLUDES) - unset(CMAKE_REQUIRED_LIBRARIES) # Add cublas if found if (CUDA_CUBLAS_LIBRARIES) set_target_properties(CUDA::CUBLAS PROPERTIES INTERFACE_LINK_LIBRARIES "${CUDA_CUBLAS_LIBRARIES}") diff --git a/control/descriptor.h b/control/descriptor.h index 6bf6d9a81ad9cb57458207b500b45b8fdc0bdc3d..dda8fa71906169a3dde202e670e27c6c1c70467b 100644 --- a/control/descriptor.h +++ b/control/descriptor.h @@ -233,33 +233,41 @@ inline static int chameleon_desc_islocal( const CHAM_desc_t *A, int m, int n ) * CHAMELEON_ACCESS_RW(C, Cm, Cn) * CHAMELEON_END_ACCESS_DECLARATION */ -#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \ - unsigned __chameleon_need_submit = 0; \ +#define CHAMELEON_BEGIN_ACCESS_DECLARATION { \ + unsigned __chameleon_need_exec = 0; \ + unsigned __chameleon_need_submit = 0; \ RUNTIME_BEGIN_ACCESS_DECLARATION -#define CHAMELEON_ACCESS_R(A, Am, An) do { \ - if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \ - RUNTIME_ACCESS_R(A, Am, An); \ -} while(0) - -#define CHAMELEON_ACCESS_W(A, Am, An) do { \ - if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \ - RUNTIME_ACCESS_W(A, Am, An); \ -} while(0) - -#define CHAMELEON_ACCESS_RW(A, Am, An) do { \ - if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \ - RUNTIME_ACCESS_RW(A, Am, An); \ -} while(0) - -#define CHAMELEON_RANK_CHANGED(rank) do {\ - __chameleon_need_submit = 1; \ - RUNTIME_RANK_CHANGED(rank); \ -} while (0) - -#define CHAMELEON_END_ACCESS_DECLARATION \ - RUNTIME_END_ACCESS_DECLARATION; \ - if (!__chameleon_need_submit) return; \ +#define CHAMELEON_ACCESS_R(A, Am, An) do { \ + if (chameleon_desc_islocal(A, Am, An)) __chameleon_need_submit = 1; \ + RUNTIME_ACCESS_R(A, Am, An); \ + } while(0) + +#define CHAMELEON_ACCESS_W(A, Am, An) do { \ + if (chameleon_desc_islocal(A, Am, An)) { \ + __chameleon_need_exec = 1; \ + __chameleon_need_submit = 1; \ + } \ + RUNTIME_ACCESS_W(A, Am, An); \ + } while(0) + +#define CHAMELEON_ACCESS_RW(A, Am, An) do { \ + if (chameleon_desc_islocal(A, Am, An)) { \ + __chameleon_need_exec = 1; \ + __chameleon_need_submit = 1; \ + } \ + RUNTIME_ACCESS_RW(A, Am, An); \ + } while(0) + +#define CHAMELEON_RANK_CHANGED(rank) do { \ + __chameleon_need_submit = 1; \ + RUNTIME_RANK_CHANGED(rank); \ + } while (0) + +#define CHAMELEON_END_ACCESS_DECLARATION \ + RUNTIME_END_ACCESS_DECLARATION; \ + if (!__chameleon_need_submit) return; \ + (void)__chameleon_need_exec; \ } #ifdef __cplusplus diff --git a/runtime/starpu/codelets/codelet_zcesca.c b/runtime/starpu/codelets/codelet_zcesca.c index 5dbadec77774438ef701cb9d07b4f7af579094f1..1cf87fe7e330d4f636c648cb4a7886378e93a714 100644 --- a/runtime/starpu/codelets/codelet_zcesca.c +++ b/runtime/starpu/codelets/codelet_zcesca.c @@ -31,7 +31,7 @@ struct cl_zcesca_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zcesca_cpu_func(void *descr[], void *cl_arg) { - struct cl_zcesca_args_s clargs; + struct cl_zcesca_args_s *clargs = (struct cl_zcesca_args_s *)cl_arg; CHAM_tile_t *Gi; CHAM_tile_t *Gj; CHAM_tile_t *G; @@ -46,9 +46,8 @@ static void cl_zcesca_cpu_func(void *descr[], void *cl_arg) Dj = cti_interface_get(descr[4]); A = cti_interface_get(descr[5]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zcesca( clargs.center, clargs.scale, clargs.axis, - clargs.m, clargs.n, clargs.mt, clargs.nt, + TCORE_zcesca( clargs->center, clargs->scale, clargs->axis, + clargs->m, clargs->n, clargs->mt, clargs->nt, Gi, Gj, G, Di, Dj, A ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -68,19 +67,12 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options, const CHAM_desc_t *Dj, int Djm, int Djn, CHAM_desc_t *A, int Am, int An ) { - struct cl_zcesca_args_s clargs = { - .center = center, - .scale = scale, - .axis = axis, - .m = m, - .n = n, - .mt = mt, - .nt = nt - }; + struct cl_zcesca_args_s *clargs = NULL; struct starpu_codelet *codelet = &cl_zcesca; void (*callback)(void*) = options->profiling ? cl_zcesca_callback : NULL; starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + int exec = 0; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(Gi, Gim, Gin); @@ -89,11 +81,23 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options, CHAMELEON_ACCESS_R(Di, Dim, Din); CHAMELEON_ACCESS_R(Dj, Djm, Djn); CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zcesca_args_s ) ); + clargs->center = center; + clargs->scale = scale; + clargs->axis = axis; + clargs->m = m; + clargs->n = n; + clargs->mt = mt; + clargs->nt = nt; + } + rt_starpu_insert_task( codelet, - STARPU_VALUE, &clargs, sizeof(struct cl_zcesca_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zcesca_args_s), STARPU_R, RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin), STARPU_R, RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn), STARPU_R, RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn), diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 39d242c01e4d6bf4f41eeb58a22f1b1e94612393..e66b85dedb1c17a8bff8e8ceb49f18fd662060dd 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -43,7 +44,7 @@ struct cl_zgemm_args_s { static void cl_zgemm_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zgemm_args_s clargs; + struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -52,18 +53,17 @@ cl_zgemm_cpu_func( void *descr[], void *cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zgemm( clargs.transA, clargs.transB, - clargs.m, clargs.n, clargs.k, - clargs.alpha, tileA, tileB, - clargs.beta, tileC ); + TCORE_zgemm( clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + clargs->alpha, tileA, tileB, + clargs->beta, tileC ); } #ifdef CHAMELEON_USE_CUDA static void -cl_zgemm_cuda_func( void *descr[], void *_cl_arg ) +cl_zgemm_cuda_func( void *descr[], void *cl_arg ) { - struct cl_zgemm_args_s clargs; + struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -72,17 +72,19 @@ cl_zgemm_cuda_func( void *descr[], void *_cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args( _cl_arg, &clargs ); - RUNTIME_getStream( stream ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + CUDA_zgemm( - clargs.transA, clargs.transB, - clargs.m, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - (cuDoubleComplex*)&(clargs.beta), + (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -112,22 +114,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zgemm_args_s clargs = { - .transA = transA, - .transB = transB, - .m = m, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ) - }; + struct cl_zgemm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zgemm"; /* Handle cache */ @@ -135,8 +127,23 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback for profiling information */ callback = options->profiling ? cl_zgemm_callback : NULL; @@ -150,10 +157,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zgemm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zgemm_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index 5fbc6b09a9a1f2d733e9a4f52aecc7320a17b08f..5dad32c9afae12291dbc1729cd3527264d86cac8 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -43,7 +43,6 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) tileD = cti_interface_get(descr[1]); tileA = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV); TCORE_zgessm(m, n, k, ib, IPIV, tileD, tileA); } diff --git a/runtime/starpu/codelets/codelet_zgesum.c b/runtime/starpu/codelets/codelet_zgesum.c index 31a6ca93e5c8f9fc74a12e656309ca3059cb731d..c851c4ef555bc1ee792a712bfaa028795df4252c 100644 --- a/runtime/starpu/codelets/codelet_zgesum.c +++ b/runtime/starpu/codelets/codelet_zgesum.c @@ -27,15 +27,14 @@ struct cl_zgesum_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zgesum_cpu_func(void *descr[], void *cl_arg) { - struct cl_zgesum_args_s clargs; + struct cl_zgesum_args_s *clargs = (struct cl_zgesum_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileW; tileA = cti_interface_get(descr[0]); tileW = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zgesum( clargs.storev, clargs.m, clargs.n, tileA, tileW ); + TCORE_zgesum( clargs->storev, clargs->m, clargs->n, tileA, tileW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -49,24 +48,29 @@ void INSERT_TASK_zgesum( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) { - struct cl_zgesum_args_s clargs = { - .storev = storev, - .m = m, - .n = n - }; + struct cl_zgesum_args_s *clargs = NULL; struct starpu_codelet *codelet = &cl_zgesum; void (*callback)(void*) = options->profiling ? cl_zgesum_callback : NULL; starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + int exec = 0; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(SUMS, SUMSm, SUMSn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgesum_args_s ) ); + clargs->storev = storev; + clargs->m = m; + clargs->n = n; + } + rt_starpu_insert_task( codelet, - STARPU_VALUE, &clargs, sizeof(struct cl_zgesum_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgesum_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn), STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c index 91b62cec66d69b9ce8f6340b781a0b5ba167b99c..9c918d9ef5c2b37de710f2e766cf448de8bbfe99 100644 --- a/runtime/starpu/codelets/codelet_zhe2ge.c +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -35,7 +35,6 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N); TCORE_zhe2ge(uplo, M, N, tileA, tileB); } diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 1b8057b1409ef597ab99892a7482f811249d4620..3cc33f4236be6b6619f91c1032df22e787d11724 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c * @@ -41,38 +42,35 @@ struct cl_zherk_args_s { static void cl_zherk_cpu_func(void *descr[], void *cl_arg) { - struct cl_zherk_args_s clargs; + struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zherk( clargs.uplo, clargs.trans, clargs.n, clargs.k, - clargs.alpha, tileA, clargs.beta, tileC ); + TCORE_zherk( clargs->uplo, clargs->trans, clargs->n, clargs->k, + clargs->alpha, tileA, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { - struct cl_zherk_args_s clargs; + struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_zherk( - clargs.uplo, clargs.trans, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->uplo, clargs->trans, clargs->n, clargs->k, + &(clargs->alpha), tileA->mat, tileA->ld, - (cuDoubleComplex*)&(clargs.beta), + &(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -101,28 +99,33 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zherk_args_s clargs = { - .uplo = uplo, - .trans = trans, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ), - }; + struct cl_zherk_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zherk"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zherk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zherk_callback : NULL; @@ -136,7 +139,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zherk, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zherk_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zherk_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 2501eb14833810892e7a5ced85f1f7659eb1d933..b169055095f5d54d5f825921306f28150b4df275 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -41,20 +41,19 @@ struct cl_zlacpy_args_s { static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) { - struct cl_zlacpy_args_s clargs; + struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - assert( clargs.displA == 0 ); - assert( clargs.displB == 0 ); + assert( clargs->displA == 0 ); + assert( clargs->displB == 0 ); /* A = tileA->mat; */ /* B = tileB->mat; */ /* CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); */ - TCORE_zlacpy( clargs.uplo, clargs.m, clargs.n, tileA, tileB ); + TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -68,27 +67,32 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, int displA, const CHAM_desc_t *A, int Am, int An, int displB, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_zlacpy_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .displA = displA, - .displB = displB, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_zlacpy_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlacpy"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_W(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlacpy_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->displA = displA; + clargs->displB = displB; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlacpy_callback : NULL; @@ -99,7 +103,7 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlacpy, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index c2204d964180306a8fd833b9024e27556779d9b3..27358b138066d2344ecedd839fd9239f86fb706f 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -38,7 +38,6 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_zlag2c( m, n, tileA, tileB); } @@ -96,7 +95,6 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_clag2z( m, n, tileA, tileB); } diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index 0b9620fca00ed3d3fc13245e1f670b05e1261ea4..0dcaa52d54a4033c6b41629f23e98421a5ed72a5 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -36,13 +36,12 @@ struct cl_zlascal_args_s { static void cl_zlascal_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zlascal_args_s clargs; + struct cl_zlascal_args_s *clargs = (struct cl_zlascal_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlascal( clargs.uplo, clargs.m, clargs.n, clargs.alpha, tileA ); + TCORE_zlascal( clargs->uplo, clargs->m, clargs->n, clargs->alpha, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -65,24 +64,29 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, return; } - struct cl_zlascal_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlascal_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlascal"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlascal_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlascal_callback : NULL; @@ -93,7 +97,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlascal, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlascal_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlascal_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c index c3fb53b0fdb0b7732c69d785fbf2c381a4064e60..95805c0142f2b966125bac51b9290bcf0f324bdf 100644 --- a/runtime/starpu/codelets/codelet_zlaset.c +++ b/runtime/starpu/codelets/codelet_zlaset.c @@ -39,13 +39,12 @@ struct cl_zlaset_args_s { static void cl_zlaset_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zlaset_args_s clargs; + struct cl_zlaset_args_s *clargs = (struct cl_zlaset_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlaset( clargs.uplo, clargs.m, clargs.n, clargs.alpha, clargs.beta, tileA ); + TCORE_zlaset( clargs->uplo, clargs->m, clargs->n, clargs->alpha, clargs->beta, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -59,25 +58,30 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, const CHAM_desc_t *A, int Am, int An ) { - struct cl_zlaset_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .alpha = alpha, - .beta = beta, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlaset_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlaset"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlaset_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlaset_callback : NULL; @@ -88,7 +92,7 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlaset, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlaset_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaset_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 740fdc0fba5bbdcde1ee0258821013d59836ed4b..700352172f44c4f3fdbda09f0474d073cb44bc44 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -37,13 +37,12 @@ struct cl_zlauum_args_s { static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) { - struct cl_zlauum_args_s clargs; + struct cl_zlauum_args_s *clargs = (struct cl_zlauum_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlauum( clargs.uplo, clargs.n, tileA ); + TCORE_zlauum( clargs->uplo, clargs->n, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,22 +55,27 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, const CHAM_desc_t *A, int Am, int An ) { - struct cl_zlauum_args_s clargs = { - .uplo = uplo, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlauum_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlauum"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlauum_args_s ) ); + clargs->uplo = uplo; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlauum_callback : NULL; @@ -82,7 +86,7 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlauum, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlauum_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlauum_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c index a58aed69b013d4df222b681f7bee42275b98bf53..68bf93fc960e44beeed0468da28694d82562fce5 100644 --- a/runtime/starpu/codelets/codelet_zplghe.c +++ b/runtime/starpu/codelets/codelet_zplghe.c @@ -41,14 +41,13 @@ struct cl_zplghe_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplghe_args_s clargs; + struct cl_zplghe_args_s *clargs = (struct cl_zplghe_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplghe( clargs.bump, clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplghe( clargs->bump, clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,27 +60,32 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplghe_args_s clargs = { - .bump = bump, - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplghe_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplghe"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplghe_args_s ) ); + clargs->bump = bump; + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplghe_callback : NULL; @@ -92,7 +96,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplghe, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplghe_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplghe_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c index f3afa23ebea541f9989767f834a1900367ae24ff..2d342bac33f25e4e3e5ff60be624fe0d81b798c5 100644 --- a/runtime/starpu/codelets/codelet_zplgsy.c +++ b/runtime/starpu/codelets/codelet_zplgsy.c @@ -41,14 +41,13 @@ struct cl_zplgsy_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplgsy_args_s clargs; + struct cl_zplgsy_args_s *clargs = (struct cl_zplgsy_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplgsy( clargs.bump, clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplgsy( clargs->bump, clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,27 +60,32 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplgsy_args_s clargs = { - .bump = bump, - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplgsy_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplgsy"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplgsy_args_s ) ); + clargs->bump = bump; + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplgsy_callback : NULL; @@ -92,7 +96,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplgsy, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplgsy_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplgsy_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index 05df0d107f8b46bbb6df07891e5373a9308b8f1a..36191f43e2486467220f6ee4cda694cd857c1d4a 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -41,14 +41,13 @@ struct cl_zplrnt_args_s { static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplrnt_args_s clargs; + struct cl_zplrnt_args_s *clargs = (struct cl_zplrnt_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplrnt( clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplrnt( clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,26 +60,31 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplrnt_args_s clargs = { - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplrnt_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplrnt"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplrnt_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplrnt_callback : NULL; @@ -91,7 +95,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplrnt, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplrnt_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplrnt_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 19da18c46dcac24a734ffcca15693bf13c0682ae..883645b31185692edbac2bba01005f290a738ce7 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -40,17 +40,16 @@ struct cl_zpotrf_args_s { static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) { - struct cl_zpotrf_args_s clargs; + struct cl_zpotrf_args_s *clargs = (struct cl_zpotrf_args_s *)cl_arg; CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zpotrf( clargs.uplo, clargs.n, tileA, &info ); + TCORE_zpotrf( clargs->uplo, clargs->n, tileA, &info ); - if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); + if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -65,25 +64,30 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - struct cl_zpotrf_args_s clargs = { - .uplo = uplo, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .iinfo = iinfo, - .sequence = options->sequence, - .request = options->request, - }; + struct cl_zpotrf_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zpotrf"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zpotrf_args_s ) ); + clargs->uplo = uplo; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zpotrf_callback : NULL; @@ -94,7 +98,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zpotrf, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zpotrf_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zpotrf_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 9af6d9a824498c908b98eb08e8bf1dcd50c665c0..a878aaa19adfc5c60ece27a785a7dd2fd1e67986 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -41,38 +42,35 @@ struct cl_zsyrk_args_s { static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) { - struct cl_zsyrk_args_s clargs; + struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zsyrk( clargs.uplo, clargs.trans, clargs.n, clargs.k, - clargs.alpha, tileA, clargs.beta, tileC ); + TCORE_zsyrk( clargs->uplo, clargs->trans, clargs->n, clargs->k, + clargs->alpha, tileA, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { - struct cl_zsyrk_args_s clargs; + struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_zsyrk( - clargs.uplo, clargs.trans, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->uplo, clargs->trans, clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, - (cuDoubleComplex*)&(clargs.beta), + (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -101,28 +99,33 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zsyrk_args_s clargs = { - .uplo = uplo, - .trans = trans, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ), - }; + struct cl_zsyrk_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zsyrk"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zsyrk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zsyrk_callback : NULL; @@ -135,9 +138,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, /* Insert the task */ rt_starpu_insert_task( &cl_zsyrk, - /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zsyrk_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyrk_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index 6555e6d1737be7bb41244a6539d02c8583f3b072..d59f03a5f3e4e9695c728f4eeb142add91e5192f 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -38,16 +38,15 @@ struct cl_ztradd_args_s { static void cl_ztradd_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztradd_args_s clargs; + struct cl_ztradd_args_s *clargs = (struct cl_ztradd_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztradd( clargs.uplo, clargs.trans, clargs.m, clargs.n, - clargs.alpha, tileA, clargs.beta, tileB ); + TCORE_ztradd( clargs->uplo, clargs->trans, clargs->m, clargs->n, + clargs->alpha, tileA, clargs->beta, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -66,28 +65,33 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, beta, B, Bm, Bn ); } - struct cl_ztradd_args_s clargs = { - .uplo = uplo, - .trans = trans, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztradd_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessB; + int exec = 0; char *cl_name = "ztradd"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztradd_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztradd_callback : NULL; @@ -101,7 +105,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztradd, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztradd_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztradd_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index ac31d898f7e253f3fde11b9f4df37875480eaa5b..20f86bd38af1f4842aa6e853392f00ed7adfbea3 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -42,37 +42,34 @@ struct cl_ztrmm_args_s { static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrmm_args_s clargs; + struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrmm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, clargs.alpha, tileA, tileB ); + TCORE_ztrmm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { - struct cl_ztrmm_args_s clargs; + struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_ztrmm( - clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, - (cuDoubleComplex*)&(clargs.alpha), + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -97,29 +94,34 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrmm_args_s clargs = { - .side = side, - .uplo = uplo, - .transA = transA, - .diag = diag, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztrmm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrmm"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrmm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrmm_callback : NULL; @@ -130,7 +132,7 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrmm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrmm_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrmm_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 00b45f3ebc2ce576eaaa387a055ea01099299851..e40dde763ed018a29003e39bc1b5ee377fbeb4df 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -42,37 +43,34 @@ struct cl_ztrsm_args_s { static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrsm_args_s clargs; + struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrsm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, clargs.alpha, tileA, tileB ); + TCORE_ztrsm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { - struct cl_ztrsm_args_s clargs; + struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_ztrsm( - clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, - (cuDoubleComplex*)&(clargs.alpha), + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -97,29 +95,34 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrsm_args_s clargs = { - .side = side, - .uplo = uplo, - .transA = transA, - .diag = diag, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztrsm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrsm"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrsm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrsm_callback : NULL; @@ -130,7 +133,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrsm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrsm_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrsm_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 157fa8c63a6a1d97af856659d42065a6792b7f29..9732e84c36f4c88c310ee35a8028fc66d4c7b572 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -41,17 +41,16 @@ struct cl_ztrtri_args_s { static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrtri_args_s clargs; + struct cl_ztrtri_args_s *clargs = (struct cl_ztrtri_args_s *)cl_arg; CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrtri( clargs.uplo, clargs.diag, clargs.n, tileA, &info ); + TCORE_ztrtri( clargs->uplo, clargs->diag, clargs->n, tileA, &info ); - if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); + if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -66,26 +65,31 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - struct cl_ztrtri_args_s clargs = { - .uplo = uplo, - .diag = diag, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .iinfo = iinfo, - .sequence = options->sequence, - .request = options->request, - }; + struct cl_ztrtri_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrtri"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrtri_args_s ) ); + clargs->uplo = uplo; + clargs->diag = diag; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrtri_callback : NULL; @@ -96,7 +100,7 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrtri, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrtri_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrtri_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */