From fba5d13232864526e1c12c7367a4733a9913c375 Mon Sep 17 00:00:00 2001 From: Mathieu Faverge <mathieu.faverge@inria.fr> Date: Mon, 11 Oct 2021 22:19:06 +0200 Subject: [PATCH] Switch to codelet argument allocation within chameleon --- runtime/starpu/codelets/codelet_zcesca.c | 32 ++++++----- runtime/starpu/codelets/codelet_zgemm.c | 69 +++++++++++++---------- runtime/starpu/codelets/codelet_zgessm.c | 1 - runtime/starpu/codelets/codelet_zgesum.c | 22 +++++--- runtime/starpu/codelets/codelet_zhe2ge.c | 1 - runtime/starpu/codelets/codelet_zherk.c | 45 ++++++++------- runtime/starpu/codelets/codelet_zlacpy.c | 34 ++++++----- runtime/starpu/codelets/codelet_zlag2c.c | 2 - runtime/starpu/codelets/codelet_zlascal.c | 26 +++++---- runtime/starpu/codelets/codelet_zlaset.c | 28 +++++---- runtime/starpu/codelets/codelet_zlauum.c | 22 +++++--- runtime/starpu/codelets/codelet_zplghe.c | 34 ++++++----- runtime/starpu/codelets/codelet_zplgsy.c | 34 ++++++----- runtime/starpu/codelets/codelet_zplrnt.c | 32 ++++++----- runtime/starpu/codelets/codelet_zpotrf.c | 32 ++++++----- runtime/starpu/codelets/codelet_zsyrk.c | 46 +++++++-------- runtime/starpu/codelets/codelet_ztradd.c | 34 ++++++----- runtime/starpu/codelets/codelet_ztrmm.c | 46 +++++++-------- runtime/starpu/codelets/codelet_ztrsm.c | 47 +++++++-------- runtime/starpu/codelets/codelet_ztrtri.c | 34 ++++++----- 20 files changed, 342 insertions(+), 279 deletions(-) diff --git a/runtime/starpu/codelets/codelet_zcesca.c b/runtime/starpu/codelets/codelet_zcesca.c index 5dbadec77..1cf87fe7e 100644 --- a/runtime/starpu/codelets/codelet_zcesca.c +++ b/runtime/starpu/codelets/codelet_zcesca.c @@ -31,7 +31,7 @@ struct cl_zcesca_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zcesca_cpu_func(void *descr[], void *cl_arg) { - struct cl_zcesca_args_s clargs; + struct cl_zcesca_args_s *clargs = (struct cl_zcesca_args_s *)cl_arg; CHAM_tile_t *Gi; CHAM_tile_t *Gj; CHAM_tile_t *G; @@ -46,9 +46,8 @@ static void cl_zcesca_cpu_func(void *descr[], void *cl_arg) Dj = cti_interface_get(descr[4]); A = cti_interface_get(descr[5]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zcesca( clargs.center, clargs.scale, clargs.axis, - clargs.m, clargs.n, clargs.mt, clargs.nt, + TCORE_zcesca( clargs->center, clargs->scale, clargs->axis, + clargs->m, clargs->n, clargs->mt, clargs->nt, Gi, Gj, G, Di, Dj, A ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -68,19 +67,12 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options, const CHAM_desc_t *Dj, int Djm, int Djn, CHAM_desc_t *A, int Am, int An ) { - struct cl_zcesca_args_s clargs = { - .center = center, - .scale = scale, - .axis = axis, - .m = m, - .n = n, - .mt = mt, - .nt = nt - }; + struct cl_zcesca_args_s *clargs = NULL; struct starpu_codelet *codelet = &cl_zcesca; void (*callback)(void*) = options->profiling ? cl_zcesca_callback : NULL; starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + int exec = 0; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(Gi, Gim, Gin); @@ -89,11 +81,23 @@ void INSERT_TASK_zcesca( const RUNTIME_option_t *options, CHAMELEON_ACCESS_R(Di, Dim, Din); CHAMELEON_ACCESS_R(Dj, Djm, Djn); CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zcesca_args_s ) ); + clargs->center = center; + clargs->scale = scale; + clargs->axis = axis; + clargs->m = m; + clargs->n = n; + clargs->mt = mt; + clargs->nt = nt; + } + rt_starpu_insert_task( codelet, - STARPU_VALUE, &clargs, sizeof(struct cl_zcesca_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zcesca_args_s), STARPU_R, RTBLKADDR(Gi, CHAMELEON_Complex64_t, Gim, Gin), STARPU_R, RTBLKADDR(Gj, CHAMELEON_Complex64_t, Gjm, Gjn), STARPU_R, RTBLKADDR(G, CHAMELEON_Complex64_t, Gm, Gn), diff --git a/runtime/starpu/codelets/codelet_zgemm.c b/runtime/starpu/codelets/codelet_zgemm.c index 39d242c01..e66b85ded 100644 --- a/runtime/starpu/codelets/codelet_zgemm.c +++ b/runtime/starpu/codelets/codelet_zgemm.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -43,7 +44,7 @@ struct cl_zgemm_args_s { static void cl_zgemm_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zgemm_args_s clargs; + struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -52,18 +53,17 @@ cl_zgemm_cpu_func( void *descr[], void *cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zgemm( clargs.transA, clargs.transB, - clargs.m, clargs.n, clargs.k, - clargs.alpha, tileA, tileB, - clargs.beta, tileC ); + TCORE_zgemm( clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + clargs->alpha, tileA, tileB, + clargs->beta, tileC ); } #ifdef CHAMELEON_USE_CUDA static void -cl_zgemm_cuda_func( void *descr[], void *_cl_arg ) +cl_zgemm_cuda_func( void *descr[], void *cl_arg ) { - struct cl_zgemm_args_s clargs; + struct cl_zgemm_args_s *clargs = (struct cl_zgemm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; CHAM_tile_t *tileC; @@ -72,17 +72,19 @@ cl_zgemm_cuda_func( void *descr[], void *_cl_arg ) tileB = cti_interface_get(descr[1]); tileC = cti_interface_get(descr[2]); - starpu_codelet_unpack_args( _cl_arg, &clargs ); - RUNTIME_getStream( stream ); + assert( tileA->format & CHAMELEON_TILE_FULLRANK ); + assert( tileB->format & CHAMELEON_TILE_FULLRANK ); + assert( tileC->format & CHAMELEON_TILE_FULLRANK ); + CUDA_zgemm( - clargs.transA, clargs.transB, - clargs.m, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->transA, clargs->transB, + clargs->m, clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, - (cuDoubleComplex*)&(clargs.beta), + (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -112,22 +114,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zgemm_args_s clargs = { - .transA = transA, - .transB = transB, - .m = m, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ) - }; + struct cl_zgemm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zgemm"; /* Handle cache */ @@ -135,8 +127,23 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_R(B, Bm, Bn); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgemm_args_s ) ); + clargs->transA = transA; + clargs->transB = transB; + clargs->m = m; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback for profiling information */ callback = options->profiling ? cl_zgemm_callback : NULL; @@ -150,10 +157,12 @@ void INSERT_TASK_zgemm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zgemm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zgemm_args_s), - STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), - STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), - accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgemm_args_s), + + /* Task handles */ + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_R, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), /* Common task arguments */ STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zgessm.c b/runtime/starpu/codelets/codelet_zgessm.c index 5fbc6b09a..5dad32c9a 100644 --- a/runtime/starpu/codelets/codelet_zgessm.c +++ b/runtime/starpu/codelets/codelet_zgessm.c @@ -43,7 +43,6 @@ static void cl_zgessm_cpu_func(void *descr[], void *cl_arg) tileD = cti_interface_get(descr[1]); tileA = cti_interface_get(descr[2]); - starpu_codelet_unpack_args(cl_arg, &m, &n, &k, &ib, &IPIV); TCORE_zgessm(m, n, k, ib, IPIV, tileD, tileA); } diff --git a/runtime/starpu/codelets/codelet_zgesum.c b/runtime/starpu/codelets/codelet_zgesum.c index 31a6ca93e..c851c4ef5 100644 --- a/runtime/starpu/codelets/codelet_zgesum.c +++ b/runtime/starpu/codelets/codelet_zgesum.c @@ -27,15 +27,14 @@ struct cl_zgesum_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zgesum_cpu_func(void *descr[], void *cl_arg) { - struct cl_zgesum_args_s clargs; + struct cl_zgesum_args_s *clargs = (struct cl_zgesum_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileW; tileA = cti_interface_get(descr[0]); tileW = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zgesum( clargs.storev, clargs.m, clargs.n, tileA, tileW ); + TCORE_zgesum( clargs->storev, clargs->m, clargs->n, tileA, tileW ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -49,24 +48,29 @@ void INSERT_TASK_zgesum( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *SUMS, int SUMSm, int SUMSn ) { - struct cl_zgesum_args_s clargs = { - .storev = storev, - .m = m, - .n = n - }; + struct cl_zgesum_args_s *clargs = NULL; struct starpu_codelet *codelet = &cl_zgesum; void (*callback)(void*) = options->profiling ? cl_zgesum_callback : NULL; starpu_option_request_t* schedopt = (starpu_option_request_t *)(options->request->schedopt); int workerid = (schedopt == NULL) ? -1 : schedopt->workerid; + int exec = 0; CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(SUMS, SUMSm, SUMSn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zgesum_args_s ) ); + clargs->storev = storev; + clargs->m = m; + clargs->n = n; + } + rt_starpu_insert_task( codelet, - STARPU_VALUE, &clargs, sizeof(struct cl_zgesum_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zgesum_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(SUMS, CHAMELEON_Complex64_t, SUMSm, SUMSn), STARPU_PRIORITY, options->priority, diff --git a/runtime/starpu/codelets/codelet_zhe2ge.c b/runtime/starpu/codelets/codelet_zhe2ge.c index 91b62cec6..9c918d9ef 100644 --- a/runtime/starpu/codelets/codelet_zhe2ge.c +++ b/runtime/starpu/codelets/codelet_zhe2ge.c @@ -35,7 +35,6 @@ static void cl_zhe2ge_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &uplo, &M, &N); TCORE_zhe2ge(uplo, M, N, tileA, tileB); } diff --git a/runtime/starpu/codelets/codelet_zherk.c b/runtime/starpu/codelets/codelet_zherk.c index 1b8057b14..3cc33f423 100644 --- a/runtime/starpu/codelets/codelet_zherk.c +++ b/runtime/starpu/codelets/codelet_zherk.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c * @@ -41,38 +42,35 @@ struct cl_zherk_args_s { static void cl_zherk_cpu_func(void *descr[], void *cl_arg) { - struct cl_zherk_args_s clargs; + struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zherk( clargs.uplo, clargs.trans, clargs.n, clargs.k, - clargs.alpha, tileA, clargs.beta, tileC ); + TCORE_zherk( clargs->uplo, clargs->trans, clargs->n, clargs->k, + clargs->alpha, tileA, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zherk_cuda_func(void *descr[], void *cl_arg) { - struct cl_zherk_args_s clargs; + struct cl_zherk_args_s *clargs = (struct cl_zherk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_zherk( - clargs.uplo, clargs.trans, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->uplo, clargs->trans, clargs->n, clargs->k, + &(clargs->alpha), tileA->mat, tileA->ld, - (cuDoubleComplex*)&(clargs.beta), + &(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -101,28 +99,33 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zherk_args_s clargs = { - .uplo = uplo, - .trans = trans, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ), - }; + struct cl_zherk_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zherk"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zherk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zherk_callback : NULL; @@ -136,7 +139,7 @@ void INSERT_TASK_zherk( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zherk, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zherk_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zherk_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 2501eb148..b16905509 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -41,20 +41,19 @@ struct cl_zlacpy_args_s { static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) { - struct cl_zlacpy_args_s clargs; + struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - assert( clargs.displA == 0 ); - assert( clargs.displB == 0 ); + assert( clargs->displA == 0 ); + assert( clargs->displB == 0 ); /* A = tileA->mat; */ /* B = tileB->mat; */ /* CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); */ - TCORE_zlacpy( clargs.uplo, clargs.m, clargs.n, tileA, tileB ); + TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -68,27 +67,32 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, int displA, const CHAM_desc_t *A, int Am, int An, int displB, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_zlacpy_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .displA = displA, - .displB = displB, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_zlacpy_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlacpy"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_W(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlacpy_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->displA = displA; + clargs->displB = displB; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlacpy_callback : NULL; @@ -99,7 +103,7 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlacpy, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_zlag2c.c b/runtime/starpu/codelets/codelet_zlag2c.c index c2204d964..27358b138 100644 --- a/runtime/starpu/codelets/codelet_zlag2c.c +++ b/runtime/starpu/codelets/codelet_zlag2c.c @@ -38,7 +38,6 @@ static void cl_zlag2c_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_zlag2c( m, n, tileA, tileB); } @@ -96,7 +95,6 @@ static void cl_clag2z_cpu_func(void *descr[], void *cl_arg) tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args(cl_arg, &m, &n); TCORE_clag2z( m, n, tileA, tileB); } diff --git a/runtime/starpu/codelets/codelet_zlascal.c b/runtime/starpu/codelets/codelet_zlascal.c index 0b9620fca..0dcaa52d5 100644 --- a/runtime/starpu/codelets/codelet_zlascal.c +++ b/runtime/starpu/codelets/codelet_zlascal.c @@ -36,13 +36,12 @@ struct cl_zlascal_args_s { static void cl_zlascal_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zlascal_args_s clargs; + struct cl_zlascal_args_s *clargs = (struct cl_zlascal_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlascal( clargs.uplo, clargs.m, clargs.n, clargs.alpha, tileA ); + TCORE_zlascal( clargs->uplo, clargs->m, clargs->n, clargs->alpha, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -65,24 +64,29 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, return; } - struct cl_zlascal_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlascal_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlascal"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlascal_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlascal_callback : NULL; @@ -93,7 +97,7 @@ void INSERT_TASK_zlascal( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlascal, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlascal_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlascal_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zlaset.c b/runtime/starpu/codelets/codelet_zlaset.c index c3fb53b0f..95805c014 100644 --- a/runtime/starpu/codelets/codelet_zlaset.c +++ b/runtime/starpu/codelets/codelet_zlaset.c @@ -39,13 +39,12 @@ struct cl_zlaset_args_s { static void cl_zlaset_cpu_func( void *descr[], void *cl_arg ) { - struct cl_zlaset_args_s clargs; + struct cl_zlaset_args_s *clargs = (struct cl_zlaset_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlaset( clargs.uplo, clargs.m, clargs.n, clargs.alpha, clargs.beta, tileA ); + TCORE_zlaset( clargs->uplo, clargs->m, clargs->n, clargs->alpha, clargs->beta, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -59,25 +58,30 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, CHAMELEON_Complex64_t beta, const CHAM_desc_t *A, int Am, int An ) { - struct cl_zlaset_args_s clargs = { - .uplo = uplo, - .m = m, - .n = n, - .alpha = alpha, - .beta = beta, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlaset_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlaset"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlaset_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->beta = beta; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlaset_callback : NULL; @@ -88,7 +92,7 @@ void INSERT_TASK_zlaset( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlaset, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlaset_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlaset_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zlauum.c b/runtime/starpu/codelets/codelet_zlauum.c index 740fdc0fb..700352172 100644 --- a/runtime/starpu/codelets/codelet_zlauum.c +++ b/runtime/starpu/codelets/codelet_zlauum.c @@ -37,13 +37,12 @@ struct cl_zlauum_args_s { static void cl_zlauum_cpu_func(void *descr[], void *cl_arg) { - struct cl_zlauum_args_s clargs; + struct cl_zlauum_args_s *clargs = (struct cl_zlauum_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zlauum( clargs.uplo, clargs.n, tileA ); + TCORE_zlauum( clargs->uplo, clargs->n, tileA ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -56,22 +55,27 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, cham_uplo_t uplo, int n, int nb, const CHAM_desc_t *A, int Am, int An ) { - struct cl_zlauum_args_s clargs = { - .uplo = uplo, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - }; + struct cl_zlauum_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zlauum"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlauum_args_s ) ); + clargs->uplo = uplo; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zlauum_callback : NULL; @@ -82,7 +86,7 @@ void INSERT_TASK_zlauum( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zlauum, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zlauum_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlauum_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplghe.c b/runtime/starpu/codelets/codelet_zplghe.c index a58aed69b..68bf93fc9 100644 --- a/runtime/starpu/codelets/codelet_zplghe.c +++ b/runtime/starpu/codelets/codelet_zplghe.c @@ -41,14 +41,13 @@ struct cl_zplghe_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zplghe_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplghe_args_s clargs; + struct cl_zplghe_args_s *clargs = (struct cl_zplghe_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplghe( clargs.bump, clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplghe( clargs->bump, clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,27 +60,32 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, double bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplghe_args_s clargs = { - .bump = bump, - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplghe_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplghe"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplghe_args_s ) ); + clargs->bump = bump; + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplghe_callback : NULL; @@ -92,7 +96,7 @@ void INSERT_TASK_zplghe( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplghe, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplghe_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplghe_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplgsy.c b/runtime/starpu/codelets/codelet_zplgsy.c index f3afa23eb..2d342bac3 100644 --- a/runtime/starpu/codelets/codelet_zplgsy.c +++ b/runtime/starpu/codelets/codelet_zplgsy.c @@ -41,14 +41,13 @@ struct cl_zplgsy_args_s { #if !defined(CHAMELEON_SIMULATION) static void cl_zplgsy_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplgsy_args_s clargs; + struct cl_zplgsy_args_s *clargs = (struct cl_zplgsy_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplgsy( clargs.bump, clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplgsy( clargs->bump, clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,27 +60,32 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, CHAMELEON_Complex64_t bump, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplgsy_args_s clargs = { - .bump = bump, - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplgsy_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplgsy"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplgsy_args_s ) ); + clargs->bump = bump; + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplgsy_callback : NULL; @@ -92,7 +96,7 @@ void INSERT_TASK_zplgsy( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplgsy, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplgsy_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplgsy_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zplrnt.c b/runtime/starpu/codelets/codelet_zplrnt.c index 05df0d107..36191f43e 100644 --- a/runtime/starpu/codelets/codelet_zplrnt.c +++ b/runtime/starpu/codelets/codelet_zplrnt.c @@ -41,14 +41,13 @@ struct cl_zplrnt_args_s { static void cl_zplrnt_cpu_func(void *descr[], void *cl_arg) { - struct cl_zplrnt_args_s clargs; + struct cl_zplrnt_args_s *clargs = (struct cl_zplrnt_args_s *)cl_arg; CHAM_tile_t *tileA; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zplrnt( clargs.m, clargs.n, tileA, - clargs.bigM, clargs.m0, clargs.n0, clargs.seed ); + TCORE_zplrnt( clargs->m, clargs->n, tileA, + clargs->bigM, clargs->m0, clargs->n0, clargs->seed ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -61,26 +60,31 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, int m, int n, const CHAM_desc_t *A, int Am, int An, int bigM, int m0, int n0, unsigned long long int seed ) { - struct cl_zplrnt_args_s clargs = { - .m = m, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .bigM = bigM, - .m0 = m0, - .n0 = n0, - .seed = seed, - }; + struct cl_zplrnt_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zplrnt"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_W(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zplrnt_args_s ) ); + clargs->m = m; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->bigM = bigM; + clargs->m0 = m0; + clargs->n0 = n0; + clargs->seed = seed; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zplrnt_callback : NULL; @@ -91,7 +95,7 @@ void INSERT_TASK_zplrnt( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zplrnt, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zplrnt_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zplrnt_args_s), STARPU_W, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zpotrf.c b/runtime/starpu/codelets/codelet_zpotrf.c index 19da18c46..883645b31 100644 --- a/runtime/starpu/codelets/codelet_zpotrf.c +++ b/runtime/starpu/codelets/codelet_zpotrf.c @@ -40,17 +40,16 @@ struct cl_zpotrf_args_s { static void cl_zpotrf_cpu_func(void *descr[], void *cl_arg) { - struct cl_zpotrf_args_s clargs; + struct cl_zpotrf_args_s *clargs = (struct cl_zpotrf_args_s *)cl_arg; CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zpotrf( clargs.uplo, clargs.n, tileA, &info ); + TCORE_zpotrf( clargs->uplo, clargs->n, tileA, &info ); - if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); + if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -65,25 +64,30 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - struct cl_zpotrf_args_s clargs = { - .uplo = uplo, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .iinfo = iinfo, - .sequence = options->sequence, - .request = options->request, - }; + struct cl_zpotrf_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "zpotrf"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zpotrf_args_s ) ); + clargs->uplo = uplo; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zpotrf_callback : NULL; @@ -94,7 +98,7 @@ void INSERT_TASK_zpotrf( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_zpotrf, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zpotrf_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zpotrf_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ diff --git a/runtime/starpu/codelets/codelet_zsyrk.c b/runtime/starpu/codelets/codelet_zsyrk.c index 9af6d9a82..a878aaa19 100644 --- a/runtime/starpu/codelets/codelet_zsyrk.c +++ b/runtime/starpu/codelets/codelet_zsyrk.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -41,38 +42,35 @@ struct cl_zsyrk_args_s { static void cl_zsyrk_cpu_func(void *descr[], void *cl_arg) { - struct cl_zsyrk_args_s clargs; + struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_zsyrk( clargs.uplo, clargs.trans, clargs.n, clargs.k, - clargs.alpha, tileA, clargs.beta, tileC ); + TCORE_zsyrk( clargs->uplo, clargs->trans, clargs->n, clargs->k, + clargs->alpha, tileA, clargs->beta, tileC ); } #if defined(CHAMELEON_USE_CUDA) static void cl_zsyrk_cuda_func(void *descr[], void *cl_arg) { - struct cl_zsyrk_args_s clargs; + struct cl_zsyrk_args_s *clargs = (struct cl_zsyrk_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileC; tileA = cti_interface_get(descr[0]); tileC = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_zsyrk( - clargs.uplo, clargs.trans, clargs.n, clargs.k, - (cuDoubleComplex*)&(clargs.alpha), + clargs->uplo, clargs->trans, clargs->n, clargs->k, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, - (cuDoubleComplex*)&(clargs.beta), + (cuDoubleComplex*)&(clargs->beta), tileC->mat, tileC->ld, stream ); @@ -101,28 +99,33 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, beta, C, Cm, Cn ); } - struct cl_zsyrk_args_s clargs = { - .uplo = uplo, - .trans = trans, - .n = n, - .k = k, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileC = C->get_blktile( C, Cm, Cn ), - }; + struct cl_zsyrk_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessC; + int exec = 0; char *cl_name = "zsyrk"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(C, Cm, Cn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_zsyrk_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->n = n; + clargs->k = k; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileC = C->get_blktile( C, Cm, Cn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_zsyrk_callback : NULL; @@ -135,9 +138,8 @@ void INSERT_TASK_zsyrk( const RUNTIME_option_t *options, /* Insert the task */ rt_starpu_insert_task( &cl_zsyrk, - /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_zsyrk_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_zsyrk_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessC, RTBLKADDR(C, CHAMELEON_Complex64_t, Cm, Cn), diff --git a/runtime/starpu/codelets/codelet_ztradd.c b/runtime/starpu/codelets/codelet_ztradd.c index 6555e6d17..d59f03a5f 100644 --- a/runtime/starpu/codelets/codelet_ztradd.c +++ b/runtime/starpu/codelets/codelet_ztradd.c @@ -38,16 +38,15 @@ struct cl_ztradd_args_s { static void cl_ztradd_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztradd_args_s clargs; + struct cl_ztradd_args_s *clargs = (struct cl_ztradd_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztradd( clargs.uplo, clargs.trans, clargs.m, clargs.n, - clargs.alpha, tileA, clargs.beta, tileB ); + TCORE_ztradd( clargs->uplo, clargs->trans, clargs->m, clargs->n, + clargs->alpha, tileA, clargs->beta, tileB ); } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -66,28 +65,33 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, beta, B, Bm, Bn ); } - struct cl_ztradd_args_s clargs = { - .uplo = uplo, - .trans = trans, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .beta = beta, - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztradd_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid, accessB; + int exec = 0; char *cl_name = "ztradd"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztradd_args_s ) ); + clargs->uplo = uplo; + clargs->trans = trans; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->beta = beta; + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztradd_callback : NULL; @@ -101,7 +105,7 @@ void INSERT_TASK_ztradd( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztradd, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztradd_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztradd_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), accessB, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrmm.c b/runtime/starpu/codelets/codelet_ztrmm.c index ac31d898f..20f86bd38 100644 --- a/runtime/starpu/codelets/codelet_ztrmm.c +++ b/runtime/starpu/codelets/codelet_ztrmm.c @@ -42,37 +42,34 @@ struct cl_ztrmm_args_s { static void cl_ztrmm_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrmm_args_s clargs; + struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrmm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, clargs.alpha, tileA, tileB ); + TCORE_ztrmm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA static void cl_ztrmm_cuda_func(void *descr[], void *cl_arg) { - struct cl_ztrmm_args_s clargs; + struct cl_ztrmm_args_s *clargs = (struct cl_ztrmm_args_s *)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_ztrmm( - clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, - (cuDoubleComplex*)&(clargs.alpha), + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -97,29 +94,34 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrmm_args_s clargs = { - .side = side, - .uplo = uplo, - .transA = transA, - .diag = diag, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztrmm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrmm"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrmm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrmm_callback : NULL; @@ -130,7 +132,7 @@ void INSERT_TASK_ztrmm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrmm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrmm_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrmm_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrsm.c b/runtime/starpu/codelets/codelet_ztrsm.c index 00b45f3eb..e40dde763 100644 --- a/runtime/starpu/codelets/codelet_ztrsm.c +++ b/runtime/starpu/codelets/codelet_ztrsm.c @@ -19,6 +19,7 @@ * @author Cedric Castagnede * @author Lucas Barros de Assis * @author Florent Pruvost + * @author Gwenole Lucas * @date 2021-03-16 * @precisions normal z -> c d s * @@ -42,37 +43,34 @@ struct cl_ztrsm_args_s { static void cl_ztrsm_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrsm_args_s clargs; + struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrsm( clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, clargs.alpha, tileA, tileB ); + TCORE_ztrsm( clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, clargs->alpha, tileA, tileB ); } #ifdef CHAMELEON_USE_CUDA static void cl_ztrsm_cuda_func(void *descr[], void *cl_arg) { - struct cl_ztrsm_args_s clargs; + struct cl_ztrsm_args_s *clargs = (struct cl_ztrsm_args_s*)cl_arg; CHAM_tile_t *tileA; CHAM_tile_t *tileB; tileA = cti_interface_get(descr[0]); tileB = cti_interface_get(descr[1]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - RUNTIME_getStream(stream); CUDA_ztrsm( - clargs.side, clargs.uplo, clargs.transA, clargs.diag, - clargs.m, clargs.n, - (cuDoubleComplex*)&(clargs.alpha), + clargs->side, clargs->uplo, clargs->transA, clargs->diag, + clargs->m, clargs->n, + (cuDoubleComplex*)&(clargs->alpha), tileA->mat, tileA->ld, tileB->mat, tileB->ld, stream ); @@ -97,29 +95,34 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, CHAMELEON_Complex64_t alpha, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - struct cl_ztrsm_args_s clargs = { - .side = side, - .uplo = uplo, - .transA = transA, - .diag = diag, - .m = m, - .n = n, - .alpha = alpha, - .tileA = A->get_blktile( A, Am, An ), - .tileB = B->get_blktile( B, Bm, Bn ), - }; + struct cl_ztrsm_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrsm"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_RW(B, Bm, Bn); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrsm_args_s ) ); + clargs->side = side; + clargs->uplo = uplo; + clargs->transA = transA; + clargs->diag = diag; + clargs->m = m; + clargs->n = n; + clargs->alpha = alpha; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrsm_callback : NULL; @@ -130,7 +133,7 @@ void INSERT_TASK_ztrsm( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrsm, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrsm_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrsm_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), STARPU_RW, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), diff --git a/runtime/starpu/codelets/codelet_ztrtri.c b/runtime/starpu/codelets/codelet_ztrtri.c index 157fa8c63..9732e84c3 100644 --- a/runtime/starpu/codelets/codelet_ztrtri.c +++ b/runtime/starpu/codelets/codelet_ztrtri.c @@ -41,17 +41,16 @@ struct cl_ztrtri_args_s { static void cl_ztrtri_cpu_func(void *descr[], void *cl_arg) { - struct cl_ztrtri_args_s clargs; + struct cl_ztrtri_args_s *clargs = (struct cl_ztrtri_args_s *)cl_arg; CHAM_tile_t *tileA; int info = 0; tileA = cti_interface_get(descr[0]); - starpu_codelet_unpack_args( cl_arg, &clargs ); - TCORE_ztrtri( clargs.uplo, clargs.diag, clargs.n, tileA, &info ); + TCORE_ztrtri( clargs->uplo, clargs->diag, clargs->n, tileA, &info ); - if ( (clargs.sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { - RUNTIME_sequence_flush( NULL, clargs.sequence, clargs.request, clargs.iinfo+info ); + if ( (clargs->sequence->status == CHAMELEON_SUCCESS) && (info != 0) ) { + RUNTIME_sequence_flush( NULL, clargs->sequence, clargs->request, clargs->iinfo+info ); } } #endif /* !defined(CHAMELEON_SIMULATION) */ @@ -66,26 +65,31 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, int iinfo ) { - struct cl_ztrtri_args_s clargs = { - .uplo = uplo, - .diag = diag, - .n = n, - .tileA = A->get_blktile( A, Am, An ), - .iinfo = iinfo, - .sequence = options->sequence, - .request = options->request, - }; + struct cl_ztrtri_args_s *clargs = NULL; void (*callback)(void*); RUNTIME_request_t *request = options->request; starpu_option_request_t *schedopt = (starpu_option_request_t *)(request->schedopt); int workerid; + int exec = 0; char *cl_name = "ztrtri"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_RW(A, Am, An); + exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; + if ( exec ) { + clargs = malloc( sizeof( struct cl_ztrtri_args_s ) ); + clargs->uplo = uplo; + clargs->diag = diag; + clargs->n = n; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->iinfo = iinfo; + clargs->sequence = options->sequence; + clargs->request = options->request; + } + /* Callback fro profiling information */ callback = options->profiling ? cl_ztrtri_callback : NULL; @@ -96,7 +100,7 @@ void INSERT_TASK_ztrtri( const RUNTIME_option_t *options, rt_starpu_insert_task( &cl_ztrtri, /* Task codelet arguments */ - STARPU_VALUE, &clargs, sizeof(struct cl_ztrtri_args_s), + STARPU_CL_ARGS, clargs, sizeof(struct cl_ztrtri_args_s), STARPU_RW, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), /* Common task arguments */ -- GitLab