diff --git a/runtime/starpu/include/chameleon_starpu.h.in b/runtime/starpu/include/chameleon_starpu.h.in
index af05edffec21e931325d02349f151f2a46a594ac..44d03baa53c167871669b744dd6dac105b1810b1 100644
--- a/runtime/starpu/include/chameleon_starpu.h.in
+++ b/runtime/starpu/include/chameleon_starpu.h.in
@@ -93,150 +93,4 @@
 # endif
 #endif
 
-#include "runtime_codelets.h"
-#include "runtime_profiling.h"
-#include "runtime_codelet_profile.h"
-#include "runtime_workspace.h"
-
-void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
-                                           cham_access_t access, cham_flttype_t flttype,
-                                           const CHAM_desc_t *A, int m, int n );
-
-/*
- * MPI Redefinitions
- */
-#if defined(CHAMELEON_USE_MPI)
-
-#if defined(CHAMELEON_RUNTIME_SYNC)
-#define rt_starpu_insert_task( _codelet_, ... ) \
-    starpu_mpi_insert_task( options->sequence->comm, (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
-#else
-#define rt_starpu_insert_task( _codelet_, ... ) \
-    starpu_mpi_insert_task( options->sequence->comm, (_codelet_), ##__VA_ARGS__ )
-#endif
-
-#else
-
-#if defined(CHAMELEON_RUNTIME_SYNC)
-#define rt_starpu_insert_task( _codelet_, ... ) \
-    starpu_insert_task( (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
-#else
-#define rt_starpu_insert_task( _codelet_, ... ) \
-    starpu_insert_task( (_codelet_), ##__VA_ARGS__ )
-#endif
-
-#endif
-
-#if defined(CHAMELEON_RUNTIME_SYNC)
-#define rt_shm_starpu_insert_task( _codelet_, ... ) \
-    starpu_insert_task( (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
-#else
-#define rt_shm_starpu_insert_task( _codelet_, ... ) \
-    starpu_insert_task( (_codelet_), ##__VA_ARGS__ )
-#endif
-
-/*
- * Enable codelets names
- */
-#if (STARPU_MAJOR_VERSION > 1) || ((STARPU_MAJOR_VERSION == 1) && (STARPU_MINOR_VERSION > 1))
-#define CHAMELEON_CODELETS_HAVE_NAME
-#endif
-
-/**
- * MPI tag management
- */
-void chameleon_starpu_tag_init( );
-int64_t chameleon_starpu_tag_book( int64_t nbtags );
-void chameleon_starpu_tag_release( int64_t min );
-
-/**
- * Access to block pointer and leading dimension
- */
-#define RTBLKADDR( desc, type, m, n ) ( (starpu_data_handle_t)RUNTIME_data_getaddr( desc, m, n ) )
-
-void RUNTIME_set_reduction_methods(starpu_data_handle_t handle, cham_flttype_t dtyp);
-
-#include "runtime_mpi.h"
-#include "runtime_wontuse.h"
-
-#if defined(CHAMELEON_USE_MPI) && defined(HAVE_STARPU_MPI_CACHED_RECEIVE)
-static inline int
-chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n)
-{
-    int64_t mm = m + (A->i / A->mb);
-    int64_t nn = n + (A->j / A->nb);
-
-    starpu_data_handle_t *ptrtile = A->schedopt;
-    ptrtile += ((int64_t)A->lmt) * nn + mm;
-
-    if (!(*ptrtile)) {
-        return 0;
-    }
-
-    return starpu_mpi_cached_receive(*ptrtile);
-}
-
-#define RUNTIME_ACCESS_WRITE_CACHED(A, Am, An) do { \
-    if (chameleon_starpu_data_iscached(A, Am, An)) __chameleon_need_submit = 1; } while(0)
-
-#else
-
-#if defined(CHAMELEON_USE_MPI)
-#warning "WAR dependencies need starpu_mpi_cached_receive support from StarPU 1.2.1 or greater"
-#endif
-#define RUNTIME_ACCESS_WRITE_CACHED(A, Am, An) do {} while (0)
-
-#endif
-
-#ifdef CHAMELEON_ENABLE_PRUNING_STATS
-
-#define RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION \
-    int __chameleon_exec = 0; \
-    int __chameleon_changed = 0;
-
-#define RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An) \
-    if (chameleon_desc_islocal(A, Am, An)) \
-        __chameleon_exec = 1;
-
-#define RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION \
-    RUNTIME_total_tasks++; \
-    if (__chameleon_exec) \
-        RUNTIME_exec_tasks++; \
-    else if (__chameleon_need_submit) \
-        RUNTIME_comm_tasks++; \
-    else if (__chameleon_changed) \
-        RUNTIME_changed_tasks++;
-
-#define RUNTIME_PRUNING_STATS_RANK_CHANGED(rank) \
-    int __chameleon_myrank; \
-    RUNTIME_comm_rank(&__chameleon_myrank); \
-    __chameleon_exec = (rank) == __chameleon_myrank; \
-    __chameleon_changed = 1; \
-
-#else
-#define RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION
-#define RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An)
-#define RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION
-#define RUNTIME_PRUNING_STATS_RANK_CHANGED(rank)
-#endif
-
-#define RUNTIME_BEGIN_ACCESS_DECLARATION \
-    RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION
-
-#define RUNTIME_ACCESS_R(A, Am, An)
-
-#define RUNTIME_ACCESS_W(A, Am, An) \
-    RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An); \
-    RUNTIME_ACCESS_WRITE_CACHED(A, Am, An)
-
-#define RUNTIME_ACCESS_RW(A, Am, An) \
-    RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An); \
-    RUNTIME_ACCESS_WRITE_CACHED(A, Am, An)
-
-#define RUNTIME_RANK_CHANGED(rank) \
-    RUNTIME_PRUNING_STATS_RANK_CHANGED(rank)
-
-#define RUNTIME_END_ACCESS_DECLARATION \
-    RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION;
-
 #endif /* _chameleon_starpu_h_ */
diff --git a/runtime/starpu/include/chameleon_starpu_internal.h b/runtime/starpu/include/chameleon_starpu_internal.h
index 807de66d4a78f1453dc8af419c3ad3dc1e6f7526..7ffa39bb2940f683cc75e2402753ae37abcde79b 100644
--- a/runtime/starpu/include/chameleon_starpu_internal.h
+++ b/runtime/starpu/include/chameleon_starpu_internal.h
@@ -59,4 +59,182 @@ static inline int cham_to_starpu_access( cham_access_t accessA ) {
     return accessA;
 }
 
+#include "runtime_codelets.h"
+#include "runtime_profiling.h"
+#include "runtime_codelet_profile.h"
+#include "runtime_workspace.h"
+
+void *RUNTIME_data_getaddr_withconversion( const RUNTIME_option_t *options,
+                                           cham_access_t access, cham_flttype_t flttype,
+                                           const CHAM_desc_t *A, int m, int n );
+
+/*
+ * MPI Redefinitions
+ *
+ * rt_starpu_insert_task() expands to starpu_mpi_insert_task() on
+ * options->sequence->comm when CHAMELEON_USE_MPI is defined (an `options`
+ * variable must therefore be visible where the macro is expanded), and to
+ * starpu_insert_task() otherwise. rt_shm_starpu_insert_task() always expands
+ * to starpu_insert_task(). When CHAMELEON_RUNTIME_SYNC is defined, every
+ * variant additionally passes STARPU_TASK_SYNCHRONOUS, 1.
+ */
+#if defined(CHAMELEON_USE_MPI)
+
+#if defined(CHAMELEON_RUNTIME_SYNC)
+#define rt_starpu_insert_task( _codelet_, ... ) \
+    starpu_mpi_insert_task( options->sequence->comm, (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
+#else
+#define rt_starpu_insert_task( _codelet_, ... ) \
+    starpu_mpi_insert_task( options->sequence->comm, (_codelet_), ##__VA_ARGS__ )
+#endif
+
+#else
+
+#if defined(CHAMELEON_RUNTIME_SYNC)
+#define rt_starpu_insert_task( _codelet_, ... ) \
+    starpu_insert_task( (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
+#else
+#define rt_starpu_insert_task( _codelet_, ... ) \
+    starpu_insert_task( (_codelet_), ##__VA_ARGS__ )
+#endif
+
+#endif
+
+#if defined(CHAMELEON_RUNTIME_SYNC)
+#define rt_shm_starpu_insert_task( _codelet_, ... ) \
+    starpu_insert_task( (_codelet_), STARPU_TASK_SYNCHRONOUS, 1, ##__VA_ARGS__ )
+#else
+#define rt_shm_starpu_insert_task( _codelet_, ... ) \
+    starpu_insert_task( (_codelet_), ##__VA_ARGS__ )
+#endif
+
+/*
+ * Enable codelets names (StarPU 1.2 and later)
+ */
+#if (STARPU_MAJOR_VERSION > 1) || ((STARPU_MAJOR_VERSION == 1) && (STARPU_MINOR_VERSION > 1))
+#define CHAMELEON_CODELETS_HAVE_NAME
+#endif
+
+/**
+ * MPI tag management
+ */
+void chameleon_starpu_tag_init( void );
+int64_t chameleon_starpu_tag_book( int64_t nbtags );
+void chameleon_starpu_tag_release( int64_t min );
+
+/**
+ * Access to block pointer and leading dimension
+ *
+ * RTBLKADDR yields RUNTIME_data_getaddr( desc, m, n ) cast to a
+ * starpu_data_handle_t; the type argument is not used by this definition.
+ */
+#define RTBLKADDR( desc, type, m, n ) ( (starpu_data_handle_t)RUNTIME_data_getaddr( desc, m, n ) )
+
+void RUNTIME_set_reduction_methods(starpu_data_handle_t handle, cham_flttype_t dtyp);
+
+#include "runtime_mpi.h"
+#include "runtime_wontuse.h"
+
+#if defined(CHAMELEON_USE_MPI) && defined(HAVE_STARPU_MPI_CACHED_RECEIVE)
+/**
+ * Returns starpu_mpi_cached_receive() for the handle stored in A->schedopt
+ * for tile (m, n), or 0 when that handle is NULL. The tile coordinates are
+ * shifted by (A->i / A->mb, A->j / A->nb) and the handle array is indexed
+ * as A->lmt * nn + mm.
+ */
+static inline int
+chameleon_starpu_data_iscached(const CHAM_desc_t *A, int m, int n)
+{
+    int64_t mm = m + (A->i / A->mb);
+    int64_t nn = n + (A->j / A->nb);
+
+    starpu_data_handle_t *ptrtile = A->schedopt;
+    ptrtile += ((int64_t)A->lmt) * nn + mm;
+
+    if (!(*ptrtile)) {
+        return 0;
+    }
+
+    return starpu_mpi_cached_receive(*ptrtile);
+}
+
+/* Sets __chameleon_need_submit, which must be declared at the expansion site. */
+#define RUNTIME_ACCESS_WRITE_CACHED(A, Am, An) do { \
+    if (chameleon_starpu_data_iscached(A, Am, An)) __chameleon_need_submit = 1; } while(0)
+
+#else
+
+#if defined(CHAMELEON_USE_MPI)
+#warning "WAR dependencies need starpu_mpi_cached_receive support from StarPU 1.2.1 or greater"
+#endif
+#define RUNTIME_ACCESS_WRITE_CACHED(A, Am, An) do {} while (0)
+
+#endif
+
+#ifdef CHAMELEON_ENABLE_PRUNING_STATS
+
+/*
+ * Pruning statistics. These macros share the locals __chameleon_exec and
+ * __chameleon_changed declared by the BEGIN macro; the END macro also reads
+ * __chameleon_need_submit, which is not declared by any macro in this block
+ * and must be provided at the expansion site. BEGIN and RANK_CHANGED declare
+ * variables, so they cannot be wrapped in do { } while (0).
+ */
+#define RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION \
+    int __chameleon_exec = 0; \
+    int __chameleon_changed = 0;
+
+#define RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An) \
+    if (chameleon_desc_islocal(A, Am, An)) \
+        __chameleon_exec = 1;
+
+#define RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION \
+    RUNTIME_total_tasks++; \
+    if (__chameleon_exec) \
+        RUNTIME_exec_tasks++; \
+    else if (__chameleon_need_submit) \
+        RUNTIME_comm_tasks++; \
+    else if (__chameleon_changed) \
+        RUNTIME_changed_tasks++;
+
+#define RUNTIME_PRUNING_STATS_RANK_CHANGED(rank) \
+    int __chameleon_myrank; \
+    RUNTIME_comm_rank(&__chameleon_myrank); \
+    __chameleon_exec = (rank) == __chameleon_myrank; \
+    __chameleon_changed = 1;
+
+#else
+#define RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION
+#define RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An)
+#define RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION
+#define RUNTIME_PRUNING_STATS_RANK_CHANGED(rank)
+#endif
+
+#define RUNTIME_BEGIN_ACCESS_DECLARATION \
+    RUNTIME_PRUNING_STATS_BEGIN_ACCESS_DECLARATION
+
+#define RUNTIME_ACCESS_R(A, Am, An)
+
+/*
+ * W and RW are wrapped in do { } while (0) so that the multi-statement
+ * expansion stays a single statement after an unbraced if/else.
+ */
+#define RUNTIME_ACCESS_W(A, Am, An) \
+    do { \
+        RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An); \
+        RUNTIME_ACCESS_WRITE_CACHED(A, Am, An); \
+    } while (0)
+
+#define RUNTIME_ACCESS_RW(A, Am, An) \
+    do { \
+        RUNTIME_PRUNING_STATS_ACCESS_W(A, Am, An); \
+        RUNTIME_ACCESS_WRITE_CACHED(A, Am, An); \
+    } while (0)
+
+#define RUNTIME_RANK_CHANGED(rank) \
+    RUNTIME_PRUNING_STATS_RANK_CHANGED(rank)
+
+#define RUNTIME_END_ACCESS_DECLARATION \
+    RUNTIME_PRUNING_STATS_END_ACCESS_DECLARATION;
+
 #endif /* _chameleon_starpu_internal_h_ */