diff --git a/coreblas/compute/core_ztile.c b/coreblas/compute/core_ztile.c index 40af52744e74444a23d6d4505df24c61af0d5a9b..dcec74350500739b6fdbed7c38cd09f9588447e7 100644 --- a/coreblas/compute/core_ztile.c +++ b/coreblas/compute/core_ztile.c @@ -367,6 +367,17 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t CORE_zlacpy( uplo, M, N, CHAM_tile_get_ptr( A ), A->ld, CHAM_tile_get_ptr( B ), B->ld ); } +void +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +{ + assert( A->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + assert( B->format & (CHAMELEON_TILE_FULLRANK | CHAMELEON_TILE_DESC) ); + + const CHAMELEON_Complex64_t *Aptr = CHAM_tile_get_ptr( A ); + CHAMELEON_Complex64_t *Bptr = CHAM_tile_get_ptr( B ); + CORE_zlacpy( uplo, M, N, Aptr + displA, LDA, Bptr + displB, LDB ); +} + void TCORE_zlange( cham_normtype_t norm, int M, diff --git a/coreblas/compute/core_ztile_empty.c b/coreblas/compute/core_ztile_empty.c index 90bfbd3e2c5ce5838ff97b6a28fd8333d389c2c7..30347d3320ba80c657ffa39795021357a43d9b7f 100644 --- a/coreblas/compute/core_ztile_empty.c +++ b/coreblas/compute/core_ztile_empty.c @@ -263,6 +263,12 @@ TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t return; } +void +TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ) +{ + return; +} + void TCORE_zlange( cham_normtype_t norm, int M, diff --git a/coreblas/eztrace_module/coreblas_tcore_eztrace_module b/coreblas/eztrace_module/coreblas_tcore_eztrace_module index 18698b8a1949840e2eeb9cf7e6bccc8b936ed363..fd60e50fbee88238b51e97130f5907e808b79e91 100644 --- a/coreblas/eztrace_module/coreblas_tcore_eztrace_module +++ b/coreblas/eztrace_module/coreblas_tcore_eztrace_module @@ -172,6 +172,7 @@ void TCORE_zher2k( int uplo, int trans, int N, int K, void *alpha, const void *A int TCORE_zherfb( 
int uplo, int N, int K, int IB, int NB, const void *A, const void *T, void *C, void *WORK, int ldwork ); int TCORE_zhessq( int storev, int uplo, int N, const void *A, void *sclssq ); void TCORE_zlacpy( int uplo, int M, int N, const void *A, void *B ); +void TCORE_zlacpyx( int uplo, int M, int N, int displA, int displB, const void *A, int LDA, void *B, int LDB ); void TCORE_zlange( int norm, int M, int N, const void *A, double *work, double *normA ); void TCORE_zlanhe( int norm, int uplo, int N, const void *A, double *work, double *normA ); void TCORE_zlansy( int norm, int uplo, int N, const void *A, double *work, double *normA ); diff --git a/coreblas/include/coreblas/coreblas_ztile.h b/coreblas/include/coreblas/coreblas_ztile.h index aefd6461b4d57107a48585ef697e7a14bfb625f0..74443b88724db7a787cb93be514bdd7108372cb3 100644 --- a/coreblas/include/coreblas/coreblas_ztile.h +++ b/coreblas/include/coreblas/coreblas_ztile.h @@ -43,6 +43,7 @@ int TCORE_zherfb( cham_uplo_t uplo, int N, int K, int IB, int NB, const CHAM_ti int TCORE_zhessq( cham_store_t storev, cham_uplo_t uplo, int N, const CHAM_tile_t *A, CHAM_tile_t *sclssq ); #endif void TCORE_zlacpy( cham_uplo_t uplo, int M, int N, const CHAM_tile_t *A, CHAM_tile_t *B ); +void TCORE_zlacpyx( cham_uplo_t uplo, int M, int N, int displA, int displB, const CHAM_tile_t *A, int LDA, CHAM_tile_t *B, int LDB ); void TCORE_zlange( cham_normtype_t norm, int M, int N, const CHAM_tile_t *A, double *work, double *normA ); #if defined(PRECISION_z) || defined(PRECISION_c) void TCORE_zlanhe( cham_normtype_t norm, cham_uplo_t uplo, int N, const CHAM_tile_t *A, double *work, double *normA ); diff --git a/runtime/starpu/codelets/codelet_zcallback.c b/runtime/starpu/codelets/codelet_zcallback.c index 00fa905af5b0ad44ecf9df9b6d682e8ee711a7f5..bcca068490306ad8eb346b18ec1d6331c650089e 100644 --- a/runtime/starpu/codelets/codelet_zcallback.c +++ b/runtime/starpu/codelets/codelet_zcallback.c @@ -50,6 +50,7 @@ CHAMELEON_CL_CB(zher2k, 
cti_handle_get_m(task->handles[0]), cti_handle_ge CHAMELEON_CL_CB(zherk, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, ( 1.+ M)*M*N) #endif CHAMELEON_CL_CB(zlacpy, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) +CHAMELEON_CL_CB(zlacpyx, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlange, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlaset, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) CHAMELEON_CL_CB(zlaset2, cti_handle_get_m(task->handles[0]), cti_handle_get_n(task->handles[0]), 0, M*N) diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 9453453af38dd61bc1fe2b0b80a04752bac140c7..7302ec42c94f7aa69adfa5692fa2ded758b797c9 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -33,6 +33,8 @@ struct cl_zlacpy_args_s { int n; int displA; int displB; + int lda; + int ldb; CHAM_tile_t *tileA; CHAM_tile_t *tileB; }; @@ -50,17 +52,32 @@ cl_zlacpy_cpu_func(void *descr[], void *cl_arg) assert( clargs->displA == 0 ); assert( clargs->displB == 0 ); - /* A = tileA->mat; */ - /* B = tileB->mat; */ - /* CORE_zlacpy( uplo, M, N, A + displA, tileA->ld, B + displB, tileB->ld ); */ + CHAMELEON_Complex64_t *A = tileA->mat; + CHAMELEON_Complex64_t *B = tileB->mat; + // CORE_zlacpy( clargs->uplo, clargs->m, clargs->n, A + clargs->displA, tileA->ld, B + clargs->displB, tileB->ld ); TCORE_zlacpy( clargs->uplo, clargs->m, clargs->n, tileA, tileB ); } + +static void +cl_zlacpyx_cpu_func(void *descr[], void *cl_arg) +{ + struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg; + CHAM_tile_t *tileA; + CHAM_tile_t *tileB; + + tileA = cti_interface_get(descr[0]); + tileB = cti_interface_get(descr[1]); + + TCORE_zlacpyx( clargs->uplo, clargs->m, clargs->n, clargs->displA, clargs->displB, + 
tileA, clargs->lda, tileB, clargs->ldb ); +} #endif /* !defined(CHAMELEON_SIMULATION) */ /* * Codelet definition */ -CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) +CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) +CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func ) void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, int nb, @@ -70,7 +87,7 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, struct cl_zlacpy_args_s *clargs = NULL; void (*callback)(void*); int exec = 0; - char *cl_name = "zlacpy"; + char *cl_name = "zlacpyx"; /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; @@ -88,14 +105,16 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, clargs->displB = displB; clargs->tileA = A->get_blktile( A, Am, An ); clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->lda = clargs->tileA->ld; + clargs->ldb = clargs->tileB->ld; } /* Callback fro profiling information */ - callback = options->profiling ? cl_zlacpy_callback : NULL; + callback = options->profiling ? 
cl_zlacpyx_callback : NULL; /* Insert the task */ rt_starpu_insert_task( - &cl_zlacpy, + &cl_zlacpyx, /* Task codelet arguments */ STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), @@ -119,7 +138,51 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, const CHAM_desc_t *A, int Am, int An, const CHAM_desc_t *B, int Bm, int Bn ) { - INSERT_TASK_zlacpyx( options, uplo, m, n, nb, - 0, A, Am, An, - 0, B, Bm, Bn ); + struct cl_zlacpy_args_s *clargs = NULL; + void (*callback)(void*); + int exec = 0; + char *cl_name = "zlacpy"; + + /* Handle cache */ + CHAMELEON_BEGIN_ACCESS_DECLARATION; + CHAMELEON_ACCESS_R(A, Am, An); + CHAMELEON_ACCESS_W(B, Bm, Bn); + exec = __chameleon_need_exec; + CHAMELEON_END_ACCESS_DECLARATION; + + if ( exec ) { + clargs = malloc( sizeof( struct cl_zlacpy_args_s ) ); + clargs->uplo = uplo; + clargs->m = m; + clargs->n = n; + clargs->displA = 0; + clargs->displB = 0; + clargs->tileA = A->get_blktile( A, Am, An ); + clargs->tileB = B->get_blktile( B, Bm, Bn ); + clargs->lda = clargs->tileA->ld; + clargs->ldb = clargs->tileB->ld; + } + + /* Callback for profiling information */ + callback = options->profiling ? 
cl_zlacpy_callback : NULL; + + /* Insert the task */ + rt_starpu_insert_task( + &cl_zlacpy, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_R, RTBLKADDR(A, CHAMELEON_Complex64_t, Am, An), + STARPU_W, RTBLKADDR(B, CHAMELEON_Complex64_t, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, +#if defined(CHAMELEON_CODELETS_HAVE_NAME) + STARPU_NAME, cl_name, +#endif + + 0 ); + + (void)nb; } diff --git a/runtime/starpu/include/runtime_codelet_z.h b/runtime/starpu/include/runtime_codelet_z.h index 013a6222f66807dbaf947dcfaed016b6f693f342..bd823f41066f1958b5396b10a7bc4589de887b23 100644 --- a/runtime/starpu/include/runtime_codelet_z.h +++ b/runtime/starpu/include/runtime_codelet_z.h @@ -94,6 +94,7 @@ CODELETS_HEADER(zhe2ge); CODELETS_HEADER(zlascal); CODELETS_HEADER(ztradd); CODELETS_HEADER(zlacpy); +CODELETS_HEADER(zlacpyx); CODELETS_HEADER(zlange); CODELETS_HEADER(zlange_max); CODELETS_HEADER(zlansy);