diff --git a/runtime/starpu/CMakeLists.txt b/runtime/starpu/CMakeLists.txt index 4d4899569021ad8d93716f147b67a6cb129a7104..ff30f4c12ad37c2bb32072af9ab74258e7211993 100644 --- a/runtime/starpu/CMakeLists.txt +++ b/runtime/starpu/CMakeLists.txt @@ -140,6 +140,10 @@ if ( STARPU_FOUND ) if ( HAVE_STARPU_MPI_REDUX ) message("-- ${Blue}Add definition HAVE_STARPU_MPI_REDUX${ColourReset}") endif() + check_function_exists(starpu_mpi_data_cpy_priority HAVE_STARPU_MPI_DATA_CPY_PRIORITY) + if ( HAVE_STARPU_MPI_DATA_CPY_PRIORITY ) + message("-- ${Blue}Add definition HAVE_STARPU_MPI_DATA_CPY_PRIORITY${ColourReset}") + endif() endif() if (CHAMELEON_USE_CUDA AND NOT CHAMELEON_SIMULATION) diff --git a/runtime/starpu/codelets/codelet_zlacpy.c b/runtime/starpu/codelets/codelet_zlacpy.c index 1614e45af17f684366b36d359f0c30fcafe623d3..77fbd3dd213327cfd895c174b34f230038f1a858 100644 --- a/runtime/starpu/codelets/codelet_zlacpy.c +++ b/runtime/starpu/codelets/codelet_zlacpy.c @@ -39,6 +39,28 @@ struct cl_zlacpy_args_s { }; #if !defined(CHAMELEON_SIMULATION) +static void cl_zlacpy_starpu_func(void *descr[], void *cl_arg) +{ + static const struct starpu_data_interface_ops *interface_ops = &starpu_interface_cham_tile_ops; + const struct starpu_data_copy_methods *copy_methods = interface_ops->copy_methods; + struct cl_zlacpy_args_s *clargs = (struct cl_zlacpy_args_s *)cl_arg; + + int workerid = starpu_worker_get_id_check(); + unsigned memory_node = starpu_worker_get_memory_node( workerid ); + + void *src_interface = descr[0]; + void *dst_interface = descr[1]; + + int rc; + + assert( clargs->displA == 0 ); + assert( clargs->displB == 0 ); + + rc = copy_methods->any_to_any( src_interface, memory_node, + dst_interface, memory_node, NULL ); + assert( rc == 0 ); +} + static void cl_zlacpy_cpu_func(void *descr[], void *cl_arg) { @@ -75,6 +97,27 @@ cl_zlacpyx_cpu_func(void *descr[], void *cl_arg) */ CODELETS_CPU( zlacpy, cl_zlacpy_cpu_func ) CODELETS_CPU( zlacpyx, cl_zlacpyx_cpu_func ) +CODELETS( zlacpy_starpu, cl_zlacpy_starpu_func, cl_zlacpy_starpu_func, STARPU_CUDA_ASYNC ) + +static inline void +insert_task_zlacpy_on_local_node( const RUNTIME_option_t *options, + starpu_data_handle_t handleA, + starpu_data_handle_t handleB ) +{ + void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL; + starpu_data_cpy_priority( handleB, handleA, 1, callback, NULL, options->priority ); +} + +#if defined(CHAMELEON_USE_MPI) +static inline void +insert_task_zlacpy_on_remote_node( const RUNTIME_option_t *options, + starpu_data_handle_t handleA, + starpu_data_handle_t handleB ) +{ + void (*callback)(void*) = options->profiling ? cl_zlacpy_callback : NULL; + starpu_mpi_data_cpy_priority( handleB, handleA, MPI_COMM_WORLD, 1, callback, NULL, options->priority ); +} +#endif void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, cham_uplo_t uplo, int m, int n, @@ -85,11 +128,13 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, void (*callback)(void*); int exec = 0; char *cl_name = "zlacpyx"; + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; - CHAMELEON_ACCESS_R(A, Am, An); - CHAMELEON_ACCESS_W(B, Bm, Bn); + CHAMELEON_ACCESS_R( A, Am, An ); + CHAMELEON_ACCESS_W( B, Bm, Bn ); exec = __chameleon_need_exec; CHAMELEON_END_ACCESS_DECLARATION; @@ -107,23 +152,46 @@ void INSERT_TASK_zlacpyx( const RUNTIME_option_t *options, /* Callback fro profiling information */ callback = options->profiling ? cl_zlacpyx_callback : NULL; +#if !defined(CHAMELEON_USE_MPI) || defined(HAVE_STARPU_MPI_DATA_CPY_PRIORITY) /* Insert the task */ - rt_starpu_insert_task( - &cl_zlacpyx, - /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), - - /* Common task arguments */ - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, - STARPU_EXECUTE_ON_WORKER, options->workerid, + if ( (uplo == ChamUpperLower) && + (tileA->m == m) && (tileA->n == n) && + (displA == 0) && (displB == 0) ) + { +#if defined(CHAMELEON_USE_MPI) + insert_task_zlacpy_on_remote_node( options, + RTBLKADDR(A, ChamComplexDouble, Am, An), + RTBLKADDR(B, ChamComplexDouble, Bm, Bn) ); +#else + insert_task_zlacpy_on_local_node( options, + RTBLKADDR(A, ChamComplexDouble, Am, An), + RTBLKADDR(B, ChamComplexDouble, Bm, Bn) ); +#endif + } + else +#endif + { + /* Insert the task */ + rt_starpu_insert_task( + &cl_zlacpyx, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, cl_name, + STARPU_NAME, cl_name, #endif - 0 ); + 0 ); + } + + (void)tileA; + (void)tileB; } void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, @@ -135,8 +203,10 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, void (*callback)(void*); int exec = 0; char *cl_name = "zlacpy"; + CHAM_tile_t *tileA = A->get_blktile( A, Am, An ); + CHAM_tile_t *tileB = B->get_blktile( B, Bm, Bn ); - /* Handle cache */ + /* Handle cache */ CHAMELEON_BEGIN_ACCESS_DECLARATION; CHAMELEON_ACCESS_R(A, Am, An); CHAMELEON_ACCESS_W(B, Bm, Bn); @@ -150,28 +220,46 @@ void INSERT_TASK_zlacpy( const RUNTIME_option_t *options, clargs->n = n; clargs->displA = 0; clargs->displB = 0; - clargs->lda = A->get_blktile( A, Am, An )->ld; - clargs->ldb = B->get_blktile( B, Bm, Bn )->ld; + clargs->lda = tileA->ld; + clargs->ldb = tileB->ld; } /* Callback fro profiling information */ callback = options->profiling ? cl_zlacpy_callback : NULL; +#if !defined(CHAMELEON_USE_MPI) || defined(HAVE_STARPU_MPI_DATA_CPY_PRIORITY) /* Insert the task */ - rt_starpu_insert_task( - &cl_zlacpy, - /* Task codelet arguments */ - STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), - STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), - STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), - - /* Common task arguments */ - STARPU_PRIORITY, options->priority, - STARPU_CALLBACK, callback, - STARPU_EXECUTE_ON_WORKER, options->workerid, + if ( (uplo == ChamUpperLower) && + (tileA->m == m) && (tileA->n == n) ) + { +#if defined(CHAMELEON_USE_MPI) + insert_task_zlacpy_on_remote_node( options, + RTBLKADDR(A, ChamComplexDouble, Am, An), + RTBLKADDR(B, ChamComplexDouble, Bm, Bn) ); +#else + insert_task_zlacpy_on_local_node( options, + RTBLKADDR(A, ChamComplexDouble, Am, An), + RTBLKADDR(B, ChamComplexDouble, Bm, Bn) ); +#endif + } + else +#endif + { + rt_starpu_insert_task( + &cl_zlacpy, + /* Task codelet arguments */ + STARPU_CL_ARGS, clargs, sizeof(struct cl_zlacpy_args_s), + STARPU_R, RTBLKADDR(A, ChamComplexDouble, Am, An), + STARPU_W, RTBLKADDR(B, ChamComplexDouble, Bm, Bn), + + /* Common task arguments */ + STARPU_PRIORITY, options->priority, + STARPU_CALLBACK, callback, + STARPU_EXECUTE_ON_WORKER, options->workerid, #if defined(CHAMELEON_CODELETS_HAVE_NAME) - STARPU_NAME, cl_name, + STARPU_NAME, cl_name, #endif - 0 ); + 0 ); + } } diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in index 4d21fe0d6e03be17be9d04bf5a3b680ee00cfc19..bd3123ab5e81f53f983c80d2889c3a88010ec7b0 100644 --- a/runtime/starpu/include/chameleon_starpu.h.in +++ b/runtime/starpu/include/chameleon_starpu.h.in @@ -51,6 +51,7 @@ #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER #cmakedefine HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER #cmakedefine HAVE_STARPU_MPI_REDUX +#cmakedefine HAVE_STARPU_MPI_DATA_CPY_PRIORITY #if (!defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_NODE_REGISTER) && !defined(HAVE_STARPU_MPI_INTERFACE_DATATYPE_REGISTER)) && defined(CHAMELEON_USE_MPI_DATATYPES) #error "This version of StarPU does not support MPI datatypes (Please compile with -DCHAMELEON_USE_MPI_DATATYPES=OFF)"