diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 0752216668610bf1d0987c771e821fe372e01324..f4a1e230a900c6c2d471685dd0b26e779a0f4104 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -352,11 +352,14 @@ endif()
 set(LIBOMP_USE_THEQUEUE FALSE CACHE BOOL
   "libOMP based on T.H.E. work stealing protocol.")
+set(LIBOMP_USE_LINKED_DEQUEUE FALSE CACHE BOOL
+  "libOMP based on doubly linked queue.")
+
 set(LIBOMP_USE_THE_AGGREGATION FALSE CACHE BOOL
   "libOMP based on T.H.E. work stealing protocol extended by requests' aggregation.")
-if(LIBOMP_USE_THE_AGGREGATION AND (NOT LIBOMP_USE_THEQUEUE))
-  set(LIBOMP_USE_THEQUEUE TRUE)
+if(LIBOMP_USE_THE_AGGREGATION AND (NOT LIBOMP_USE_THEQUEUE AND NOT LIBOMP_USE_LINKED_DEQUEUE))
+  libomp_error_say("OpenMP aggregation protocol requires either LIBOMP_USE_THEQUEUE or LIBOMP_USE_LINKED_DEQUEUE but they are not defined")
 endif()
 
 set(LIBOMP_USE_CONCURRENT_WRITE FALSE CACHE BOOL
@@ -379,13 +382,13 @@ else()
 endif()
 
 set(LIBOMP_USE_EXTSCHED_MEM FALSE CACHE BOOL
-  "libOMP experimental memory scheduling.")
+  "libOMP memory scheduling - experimental feature.")
 
 if(LIBOMP_USE_EXTSCHED_MEM AND (NOT LIBOMP_USE_NUMA))
   libomp_error_say("OpenMP memory scheduling extension requires LIBOMP_USE_NUMA but it is not available")
 endif()
 
-if(LIBOMP_USE_EXTSCHED_MEM AND (NOT LIBOMP_USE_THEQUEUE))
-  libomp_error_say("OpenMP memory scheduling extension requires LIBOMP_USE_THEQUEUE but it is not available")
+if(LIBOMP_USE_EXTSCHED_MEM AND (NOT LIBOMP_USE_THEQUEUE AND NOT LIBOMP_USE_LINKED_DEQUEUE))
+  libomp_error_say("OpenMP memory scheduling extension requires either LIBOMP_USE_THEQUEUE or LIBOMP_USE_LINKED_DEQUEUE but they are not defined")
 endif()
 
 if(LIBOMP_USE_EXTSCHED_MEM)
@@ -394,6 +397,19 @@ else()
   set(OMP_EXTENSION_SCHED_MEMORY 0)
 endif()
 
+set(LIBOMP_USE_REORDER4LOCALITY FALSE CACHE BOOL
+  "libOMP reorder tasks' list for better locality - experimental feature.")
+
+if(LIBOMP_USE_REORDER4LOCALITY AND (NOT LIBOMP_USE_LINKED_DEQUEUE))
+  libomp_error_say("OpenMP locality scheduling extension requires LIBOMP_USE_LINKED_DEQUEUE but it is not defined")
+endif()
+
+if(LIBOMP_USE_REORDER4LOCALITY)
+  set(OMP_EXTENSION_SCHED_LOCALITY 1<<6)
+else()
+  set(OMP_EXTENSION_SCHED_LOCALITY 0)
+endif()
+
 set(LIBOMP_USE_PAPI FALSE CACHE BOOL
   "libOMP tracing based on PAPI")
 
@@ -469,8 +485,10 @@ if(${LIBOMP_STANDALONE_BUILD})
     libomp_say("Use OMPT-trace -- ${LIBOMP_OMPT_TRACE}")
   endif()
   libomp_say("Use T.H.E. protocol -- ${LIBOMP_USE_THEQUEUE}")
+  libomp_say("Use linked deque -- ${LIBOMP_USE_LINKED_DEQUEUE}")
   libomp_say("Use request combining-- ${LIBOMP_USE_THE_AGGREGATION}")
   libomp_say("Use memory scheduler -- ${LIBOMP_USE_EXTSCHED_MEM}")
+  libomp_say("Use locality scheduler-- ${LIBOMP_USE_REORDER4LOCALITY}")
   libomp_say("Use concurrent write -- ${LIBOMP_USE_CONCURRENT_WRITE}")
   libomp_say("Use dyn. hashmap. -- ${LIBOMP_USE_DYNHASH}")
   libomp_say("Use var. length dep. -- ${LIBOMP_USE_VARDEP}")
-- ${LIBOMP_USE_VARDEP}") diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt index c2027c9d73e38a3c43418ca1a4223a3c4c90ad39..705568ec8fcfe13cb7b6b4a54d9db2ff7f90167d 100644 --- a/runtime/src/CMakeLists.txt +++ b/runtime/src/CMakeLists.txt @@ -90,6 +90,7 @@ else() kmp_wait_release.cpp kmp_affinity.cpp kmp_hws.cpp + kmp_taskreschedule.cpp kmp_queues.cpp kmp_dispatch.cpp kmp_lock.cpp @@ -114,7 +115,7 @@ else() libomp_append(LIBOMP_CXXFILES kmp_taskdeps.cpp) libomp_append(LIBOMP_CXXFILES kmp_cancel.cpp) endif() - if (${LIBOMP_USE_THEQUEUE}) + if (${LIBOMP_USE_THEQUEUE} OR ${LIBOMP_USE_LINKED_DEQUEUE}) libomp_append(LIBOMP_CFILES kaapi_sched_ccsync.c) libomp_append(LIBOMP_CFILES kaapi_rt.c) endif() diff --git a/runtime/src/dllexports b/runtime/src/dllexports index 26452e71f18611e82dd6180ba6f45df6ec2c9428..31cb5bb1c22200d832ceacb1103c2478e1c387fe 100644 --- a/runtime/src/dllexports +++ b/runtime/src/dllexports @@ -302,6 +302,8 @@ __kmpc_omp_info_free_memory 608 __kmpc_omp_set_memory_limit 609 __kmpc_omp_set_task_attr 610 + __kmpc_omp_begin_sched_graph 611 + __kmpc_omp_end_sched_graph 612 __kmpc_omp_taskwait 193 __kmpc_omp_task_begin_if0 196 __kmpc_omp_task_complete_if0 197 @@ -519,6 +521,8 @@ kmp_set_warnings_off 780 omp_task_declare_dependencies_array 4005 omp_task_declare_dependencies_array_noalias 4006 omp_set_task_attr 4007 + omp_begin_sched_graph 4008 + omp_end_sched_graph 4009 omp_get_num_teams 865 omp_get_team_num 866 diff --git a/runtime/src/include/40/omp.h.var b/runtime/src/include/40/omp.h.var index 451e6c52c4c24cdd9de948077495dbeefce4e2e4..a8257367bc14178af8e5aeed910f49c6155b2e0c 100644 --- a/runtime/src/include/40/omp.h.var +++ b/runtime/src/include/40/omp.h.var @@ -26,12 +26,14 @@ # define OMP_EXTENSION_CONCURRENT_WRITE (@OMP_EXTENSION_CW@) # define OMP_VARLEN_DEPENDENCIES (@OMP_VARLEN_DEPENDENCIES@) # define OMP_EXTENSION_SCHED_MEMORY (@OMP_EXTENSION_SCHED_MEMORY@) +# define OMP_EXTENSION_SCHED_LOCALITY (@OMP_EXTENSION_SCHED_LOCALITY@) # define KMP_EXTENSION ( OMP_EXTENSION_AFFINITY\ |OMP_EXTENSION_TASKNAME\ |OMP_EXTENSION_CONCURRENT_WRITE\ |OMP_VARLEN_DEPENDENCIES\ |OMP_EXTENSION_SCHED_MEMORY\ + |OMP_EXTENSION_SCHED_LOCALITY\ ) # define OMP_EXTENSION KMP_EXTENSION @@ -120,6 +122,10 @@ extern void __KAI_KMPC_CONVENTION omp_set_task_name (char *); extern void __KAI_KMPC_CONVENTION omp_set_task_attr (char, long int); #endif +#if OMP_EXTENSION_SCHED_LOCALITY + extern void* __KAI_KMPC_CONVENTION omp_begin_sched_graph (int); + extern void __KAI_KMPC_CONVENTION omp_end_sched_graph (void*, int); +#endif #if OMP_EXTENSION_CONCURRENT_WRITE typedef enum omp_depend_info_t { omp_depend_info_none = 0, diff --git a/runtime/src/include/45/omp.h.var b/runtime/src/include/45/omp.h.var index 0467d06a1366f864d3141601b2f88ef928c4eed2..8448821d9038fbae81d7557c74e3b657c4368483 100644 --- a/runtime/src/include/45/omp.h.var +++ b/runtime/src/include/45/omp.h.var @@ -26,12 +26,14 @@ # define OMP_EXTENSION_CONCURRENT_WRITE (@OMP_EXTENSION_CW@) # define OMP_VARLEN_DEPENDENCIES (@OMP_VARLEN_DEPENDENCIES@) # define OMP_EXTENSION_SCHED_MEMORY (@OMP_EXTENSION_SCHED_MEMORY@) +# define OMP_EXTENSION_SCHED_LOCALITY (@OMP_EXTENSION_SCHED_LOCALITY@) # define KMP_EXTENSION ( OMP_EXTENSION_AFFINITY\ |OMP_EXTENSION_TASKNAME\ |OMP_EXTENSION_CONCURRENT_WRITE\ |OMP_VARLEN_DEPENDENCIES\ |OMP_EXTENSION_SCHED_MEMORY\ + |OMP_EXTENSION_SCHED_LOCALITY\ ) # define OMP_EXTENSION KMP_EXTENSION @@ -138,6 +140,10 @@ extern void __KAI_KMPC_CONVENTION omp_set_task_name (char *); extern void __KAI_KMPC_CONVENTION 
 #endif
+#if OMP_EXTENSION_SCHED_LOCALITY
+    extern void* __KAI_KMPC_CONVENTION  omp_begin_sched_graph (int);
+    extern void  __KAI_KMPC_CONVENTION  omp_end_sched_graph (void*, int);
+#endif
 #if OMP_EXTENSION_CONCURRENT_WRITE
     typedef enum omp_depend_info_t {
       omp_depend_info_none = 0,
diff --git a/runtime/src/include/50/omp.h.var b/runtime/src/include/50/omp.h.var
index 727a088c686a3ae1b3ff21a375d524f646f96665..9d9f0bb02a13a5a86654af8f864524e8269cc0e1 100644
--- a/runtime/src/include/50/omp.h.var
+++ b/runtime/src/include/50/omp.h.var
@@ -26,12 +26,14 @@
 #   define OMP_EXTENSION_CONCURRENT_WRITE (@OMP_EXTENSION_CW@)
 #   define OMP_VARLEN_DEPENDENCIES (@OMP_VARLEN_DEPENDENCIES@)
 #   define OMP_EXTENSION_SCHED_MEMORY (@OMP_EXTENSION_SCHED_MEMORY@)
+#   define OMP_EXTENSION_SCHED_LOCALITY (@OMP_EXTENSION_SCHED_LOCALITY@)
 
 #   define KMP_EXTENSION ( OMP_EXTENSION_AFFINITY\
                           |OMP_EXTENSION_TASKNAME\
                           |OMP_EXTENSION_CONCURRENT_WRITE\
                           |OMP_VARLEN_DEPENDENCIES\
                           |OMP_EXTENSION_SCHED_MEMORY\
+                          |OMP_EXTENSION_SCHED_LOCALITY\
                          )
 #   define OMP_EXTENSION KMP_EXTENSION
 
@@ -138,6 +140,10 @@
     extern void  __KAI_KMPC_CONVENTION  omp_set_task_name (char *);
     extern void  __KAI_KMPC_CONVENTION  omp_set_task_attr (char, long int);
 #endif
+#if OMP_EXTENSION_SCHED_LOCALITY
+    extern void* __KAI_KMPC_CONVENTION  omp_begin_sched_graph (int);
+    extern void  __KAI_KMPC_CONVENTION  omp_end_sched_graph (void*, int);
+#endif
 #if OMP_EXTENSION_CONCURRENT_WRITE
     typedef enum omp_depend_info_t {
       omp_depend_info_none = 0,
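
All three omp.h templates expose the same pair of entry points, so a program can use them portably across the supported OpenMP versions. A minimal usage sketch, assuming a runtime configured with -DLIBOMP_USE_LINKED_DEQUEUE=TRUE and -DLIBOMP_USE_REORDER4LOCALITY=TRUE; the semantics of the int flag arguments are not fixed by this patch, so 0 is passed, and the handle returned by omp_begin_sched_graph is simply threaded through to omp_end_sched_graph:

    #include <omp.h>
    #include <stdio.h>

    #define N 64

    int main(void)
    {
      static double a[N];
      #pragma omp parallel
      #pragma omp master
      {
    #if OMP_EXTENSION & OMP_EXTENSION_SCHED_LOCALITY
        /* start recording: ready tasks accumulate in the thread-private
           list (th_tasklist) instead of going to the shared queues */
        void* graph = omp_begin_sched_graph(0);
    #endif
        for (int i = 0; i < N; ++i) {
          #pragma omp task depend(out: a[i])
          a[i] = 2.0 * i;
          #pragma omp task depend(in: a[i])
          printf("a[%d] = %g\n", i, a[i]);
        }
    #if OMP_EXTENSION & OMP_EXTENSION_SCHED_LOCALITY
        /* publish the (possibly reordered) ready list to the workers */
        omp_end_sched_graph(graph, 0);
    #endif
        #pragma omp taskwait
      }
      return 0;
    }
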
diff --git a/runtime/src/kaapi_sched_ccsync.c b/runtime/src/kaapi_sched_ccsync.c
index 8024ef666f546fcb13871cb4dacece1014ab25e0..5790537d90d8666d832a74709bd4753af6806ca3 100644
--- a/runtime/src/kaapi_sched_ccsync.c
+++ b/runtime/src/kaapi_sched_ccsync.c
@@ -54,11 +54,6 @@
 #endif
 
-/* FWD
-*/
-static inline int __kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task );
-static inline kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue );
-static inline kaapi_task_t* __kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue );
 
 /* ============================= REQUEST ============================ */
@@ -156,6 +151,7 @@ typedef struct kaapi_push_request_t {
 } kaapi_push_request_t;
 
+#if LIBOMP_USE_LINKED_DEQUEUE
 /** \ingroup WS
     Arg for push request
 */
@@ -165,10 +161,10 @@ typedef struct kaapi_pushlist_request_t {
   int ident;      /* system wide id who is emetting the request */
   int mask_arch;  /* accepted arch */
   int status;     /* request status */
-  kaapi_task_t** list;   /* to push */
+  kaapi_wsqueue_t* list; /* linked list to push */
   int size;       /* size of list */
 } kaapi_pushlist_request_t;
-
+#endif
 
 /** \ingroup WS
     Request emitted to get work.
@@ -180,7 +176,9 @@ typedef union kaapi_request_t {
   kaapi_steal_request_t    steal_a;
   kaapi_pop_request_t      pop_a;
   kaapi_push_request_t     push_a;
+#if LIBOMP_USE_LINKED_DEQUEUE
   kaapi_pushlist_request_t push_l;
+#endif
 } kaapi_request_t;
 
 
@@ -387,6 +385,15 @@ int kaapi_listrequest_ccsync_iterator_count(
 /* no concurrency here: always called before starting threads */
 int kaapi_wsqueue_init(kaapi_wsqueue_t* queue, size_t size, int numa_node)
 {
+#if LIBOMP_USE_LINKED_DEQUEUE
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_init_bootstrap_lock( &queue->deque_lock );
+#endif
+  queue->deque_H = 0;
+  queue->deque_T = 0;
+  queue->deque_size = 0;
+  queue->numa_node = numa_node;
+#else // LIBOMP_USE_LINKED_DEQUEUE
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_init_bootstrap_lock( &queue->deque_lock_owner );
   __kmp_init_bootstrap_lock( &queue->deque_lock );
@@ -404,6 +411,7 @@ int kaapi_wsqueue_init(kaapi_wsqueue_t* queue, size_t size, int numa_node)
     __kmp_allocate( size * sizeof(kaapi_task_t *));
 #endif
   queue->deque_size = (uint32_t)size;
+#endif // LIBOMP_USE_LINKED_DEQUEUE
 
 #if LIBOMP_USE_THE_AGGREGATION
   /* first non blocked node ! */
@@ -425,6 +433,11 @@ int kaapi_wsqueue_init(kaapi_wsqueue_t* queue, size_t size, int numa_node)
 /* no concurrency here: always called before starting threads */
 int kaapi_wsqueue_realloc(kaapi_wsqueue_t* queue )
 {
+#if LIBOMP_USE_LINKED_DEQUEUE
+  // never realloc queue
+  return 0;
+#else // #if LIBOMP_USE_LINKED_DEQUEUE
+
   kmp_int32 size = queue->deque_size;
   kmp_int32 new_size = 2 * size;
   kaapi_task_t ** new_deque;
@@ -455,6 +468,7 @@ int kaapi_wsqueue_realloc(kaapi_wsqueue_t* queue )
 #endif
   queue->deque = new_deque;
   queue->deque_size = new_size;
+#endif
   return 0;
 }
 
@@ -462,6 +476,9 @@ int kaapi_wsqueue_realloc(kaapi_wsqueue_t* queue )
 /*
 */
 static int kaapi_wsqueue_realloc_push_remote(kaapi_wsqueue_t* queue )
 {
+#if LIBOMP_USE_LINKED_DEQUEUE
+  return 0;
+#else // #if LIBOMP_USE_LINKED_DEQUEUE
   kmp_int32 size = queue->deque_size;
   kmp_int32 new_size = 2 * size;
   kaapi_task_t ** new_deque;
@@ -488,23 +505,30 @@ static int kaapi_wsqueue_realloc_push_remote(kaapi_wsqueue_t* queue )
   queue->deque_T += shift;
   queue->deque = new_deque;
   queue->deque_size = new_size;
+#endif
   return 0;
 }
 
-/* no concurrency here: always called before starting threads
+/*
 */
 int kaapi_wsqueue_fini(kaapi_wsqueue_t* queue)
 {
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_acquire_bootstrap_lock( &queue->deque_lock );
 #endif
+  KMP_DEBUG_ASSERT(queue->deque_H == 0);
+  KMP_DEBUG_ASSERT(queue->deque_T == 0);
+
+#if LIBOMP_USE_LINKED_DEQUEUE // nothing to free: tasks are linked in place
+#else // LIBOMP_USE_LINKED_DEQUEUE
 #if LIBOMP_USE_AFFINITY
   numa_free((void*)queue->deque, queue->deque_size );
 #else
   if (queue->deque) __kmp_free((void*)queue->deque);
 #endif
   queue->deque = 0;
+#endif
 
 #if LIBOMP_USE_THE_AGGREGATION
   if (queue->tail) {
@@ -515,8 +539,7 @@ int kaapi_wsqueue_fini(kaapi_wsqueue_t* queue)
 #endif
     queue->tail = 0;
   }
-#endif
-#if !LIBOMP_USE_THE_AGGREGATION
+#else // !LIBOMP_USE_THE_AGGREGATION
   __kmp_release_bootstrap_lock( &queue->deque_lock );
 #endif
   return 0;
@@ -643,10 +666,10 @@ int kaapi_sched_ccsync_commit_request( kaapi_wsqueue_t* queue, kaapi_request_t*
       tmpNode->wait = 0;
     }
     break;
-#if 0
+#if LIBOMP_USE_LINKED_DEQUEUE
     case KAAPI_REQUEST_OP_PUSHLIST:
     {
-      if (0 == ld->vtable->fs_pushlist(ld, req->push_l.list))
+      if (0 == __kaapi_wsqueue_push_tasklist(queue, req->push_l.list))
         req->header.status = KAAPI_REQUEST_S_OK;
       else
         req->header.status = KAAPI_REQUEST_S_NOK;
@@ -840,10 +863,38 @@ int kaapi_sched_ccsync_pgo_fini( kaapi_place_group_operation_t* kpgo )
 
 /* Assume 1 owner
    and multiple thieves
 */
-static inline int __kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task )
+int __kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task )
 {
   int err;
   KMP_DEBUG_ASSERT( task != 0);
+#if LIBOMP_USE_LINKED_DEQUEUE
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_acquire_bootstrap_lock( & queue->deque_lock );
+#endif
+  /* push to head */
+  task->prev = 0;
+  task->next = queue->deque_H;
+  if (task->next ==0)
+    queue->deque_T = task;
+  else
+    queue->deque_H->prev = task;
+  queue->deque_H = task;
+#if 0
+  /* push to tail */
+  task->next = 0;
+  task->prev = queue->deque_T;
+  if (task->prev ==0)
+    queue->deque_H = task;
+  else
+    queue->deque_T->next = task;
+  queue->deque_T = task;
+#endif
+  ++queue->deque_size;
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_release_bootstrap_lock( & queue->deque_lock );
+#endif
+  return 0;
+#else // LIBOMP_USE_LINKED_DEQUEUE
   if (!remote)
   {
 reread:
@@ -898,22 +949,52 @@ redo_read:
     __kmp_release_bootstrap_lock( & queue->deque_lock );
 #endif
   }
+#endif // LIBOMP_USE_LINKED_DEQUEUE
   return err;
 }
 
 
+#if LIBOMP_USE_LINKED_DEQUEUE
+int __kaapi_wsqueue_push_tasklist(kaapi_wsqueue_t* queue, kaapi_wsqueue_t* list )
+{
+  KMP_DEBUG_ASSERT( list != 0);
+  if (kaapi_wsqueue_empty( list )) return 0;
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_acquire_bootstrap_lock( & queue->deque_lock );
+#endif
+  /* push to tail */
+  list->deque_T->next = 0;
+  list->deque_H->prev = queue->deque_T;
+  if (list->deque_H->prev ==0)
+    queue->deque_H = list->deque_H;
+  else
+    queue->deque_T->next = list->deque_H;
+  queue->deque_T = list->deque_T;
+  queue->deque_size += list->deque_size;
+
+  list->deque_H = list->deque_T = 0; list->deque_size = 0;
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_release_bootstrap_lock( & queue->deque_lock );
+#endif
+  return 0;
+}
+#endif // LIBOMP_USE_LINKED_DEQUEUE
+
 
 /* Assume N owners and multiple thieves
 */
 static inline int __kaapi_wsqueue_locked_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task )
 {
   int retval;
-
+#if LIBOMP_USE_LINKED_DEQUEUE // always locked
+  retval = __kaapi_wsqueue_push_task( queue, remote, task );
+#else
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_acquire_bootstrap_lock( &queue->deque_lock_owner );
 #endif
   retval = __kaapi_wsqueue_push_task( queue, remote, task );
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_release_bootstrap_lock( &queue->deque_lock_owner );
+#endif
 #endif
   return retval;
 }
@@ -922,9 +1003,32 @@ static inline int __kaapi_wsqueue_locked_push_task(kaapi_wsqueue_t* queue, int r
 
 /*
 */
-static inline kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue )
+kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue )
 {
-  kaapi_task_t* task;
+  kaapi_task_t* task =0;
+
+#if LIBOMP_USE_LINKED_DEQUEUE
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_acquire_bootstrap_lock( & queue->deque_lock );
+#endif
+  /* pop from Head */
+  if (queue->deque_H)
+  {
+    task = queue->deque_H;
+    queue->deque_H = task->next;
+    if (queue->deque_H ==0)
+      queue->deque_T = 0;
+    else
+      task->next->prev = 0;
+
+    task->prev = task->next = 0;
+    --queue->deque_size;
+  }
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_release_bootstrap_lock( &queue->deque_lock );
+#endif
+  return task;
+#else // LIBOMP_USE_LINKED_DEQUEUE
   uint32_t deque_tail = queue->deque_T;
 
   if (queue->deque_H >= deque_tail)
@@ -960,6 +1064,7 @@ static inline kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue )
     return task;
   }
   return queue->deque[ new_tail ];
+#endif
 }
 
 
@@ -968,13 +1073,16 @@ static inline kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue )
 static inline kaapi_task_t* __kaapi_wsqueue_locked_pop_task( kaapi_wsqueue_t* queue )
 {
   kaapi_task_t* task;
-
+#if LIBOMP_USE_LINKED_DEQUEUE
+  task = __kaapi_wsqueue_pop_task( queue );
+#else
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_acquire_bootstrap_lock( &queue->deque_lock_owner );
 #endif
   task = __kaapi_wsqueue_pop_task( queue );
 #if !LIBOMP_USE_THE_AGGREGATION
   __kmp_release_bootstrap_lock( &queue->deque_lock_owner );
+#endif
 #endif
   return task;
 }
@@ -982,21 +1090,36 @@ static inline kaapi_task_t* __kaapi_wsqueue_locked_pop_task( kaapi_wsqueue_t* qu
 
 /*
 */
-static inline kaapi_task_t* __kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue )
+kaapi_task_t* __kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue )
 {
-  kaapi_task_t* task;
+  kaapi_task_t* task =0;
+#if LIBOMP_USE_LINKED_DEQUEUE
+  /* steal from the tail: the head, kept hot, is left to the owner */
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_acquire_bootstrap_lock( & queue->deque_lock );
+#endif
+  /* steal from tail */
+  if (queue->deque_T)
+  {
+    task = queue->deque_T;
+    queue->deque_T = task->prev;
+    if (queue->deque_T ==0)
+      queue->deque_H = 0;
+    else
+      task->prev->next = 0;
 
-#if 0
-  uint32_t deque_head = queue->deque_H;
-  uint32_t deque_tail = queue->deque_T;
-  if (deque_head >= deque_tail)
-    return 0;
-#else
-  uint32_t deque_head;
-  uint32_t deque_tail;
+    task->prev = task->next = 0;
+    --queue->deque_size;
+  }
+#if !LIBOMP_USE_THE_AGGREGATION
+  __kmp_release_bootstrap_lock( & queue->deque_lock );
 #endif
+  return task;
 
-  task = 0;
+#else // LIBOMP_USE_LINKED_DEQUEUE
+
+  uint32_t deque_head;
+  uint32_t deque_tail;
 
   /* Thief lock */
 #if !LIBOMP_USE_THE_AGGREGATION
@@ -1018,6 +1141,7 @@ static inline kaapi_task_t* __kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue )
   /* Thief unlock */
   __kmp_release_bootstrap_lock( & queue->deque_lock );
 #endif
+#endif // LIBOMP_USE_LINKED_DEQUEUE
   return task;
 }
 
@@ -1044,6 +1168,23 @@ int kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* ta
 #endif
 }
 
+#if LIBOMP_USE_LINKED_DEQUEUE
+int kaapi_wsqueue_push_tasklist(kaapi_wsqueue_t* queue, kaapi_wsqueue_t* list )
+{
+#if LIBOMP_USE_THE_AGGREGATION
+  kaapi_request_t* request = kaapi_sched_ccsync_post_request(queue);
+  request->header.ident = 0; /* queue->kid; */
+  request->header.op = KAAPI_REQUEST_OP_PUSHLIST;
+  request->push_l.list = list;
+
+  kaapi_sched_ccsync_commit_request(queue, request);
+
+  return kaapi_request_get_status(request) == KAAPI_REQUEST_S_OK ? 0 : EINVAL;
+#else
+  return __kaapi_wsqueue_push_tasklist( queue, list );
+#endif
+}
+#endif
 
 /* Client
 */
@@ -1125,3 +1266,49 @@ kaapi_task_t* kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue )
   return __kaapi_wsqueue_steal_task(queue);
 #endif
 }
+
+
+// Insert the sublist [itBegin .. itEnd] after itInsert in queue
+void kaapi_wsqueue_splice(
+    kaapi_wsqueue_t* queue,
+    kaapi_task_t* itInsert,
+    kaapi_task_t* itBegin,
+    kaapi_task_t* itEnd )
+{
+#if LIBOMP_USE_LINKED_DEQUEUE
+  kaapi_task_t* nextItInsert = itInsert->next;
+  kaapi_task_t* prevItBegin  = itBegin->prev;
+  kaapi_task_t* nextItEnd    = itEnd->next;
+
+#ifndef NDEBUG
+  // Note: itBegin == itEnd is supported
+  KMP_DEBUG_ASSERT(itInsert != itBegin); // Not supported yet
+  KMP_DEBUG_ASSERT(itInsert != itEnd);   // Not supported yet
+  for(kaapi_task_t* it=itBegin ; it!=itEnd ; it=it->next)
+  {
+    KMP_DEBUG_ASSERT(it != NULL);     // Corruption: NULL reached before itEnd
+    KMP_DEBUG_ASSERT(it != itInsert); // Corruption: insert detected
+  }
+#endif
+
+  // slice removal
+  if(prevItBegin != 0)
+    prevItBegin->next = nextItEnd;
+  else
+    queue->deque_H = nextItEnd;
+  if(nextItEnd != 0)
+    nextItEnd->prev = prevItBegin;
+  else
+    queue->deque_T = prevItBegin;
+
+  // slice insertion
+  itInsert->next = itBegin;
+  itBegin->prev = itInsert;
+  if(nextItInsert != 0)
+    nextItInsert->prev = itEnd;
+  else
+    queue->deque_T = itEnd;
+
+  itEnd->next = nextItInsert;
+#endif
+}
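
For reference, the discipline implemented by the new linked-deque branch of kaapi_sched_ccsync.c, summarized as a sketch (under !LIBOMP_USE_THE_AGGREGATION all four operations serialize on deque_lock; with aggregation the request-combining protocol serializes them instead):

    /* owner: __kaapi_wsqueue_push_task(q, r, t)   links t at the head
     *        __kaapi_wsqueue_pop_task(q)          unlinks the head (LIFO for the owner)
     * thief: __kaapi_wsqueue_steal_task(q)        unlinks the tail (oldest task first)
     * bulk:  __kaapi_wsqueue_push_tasklist(q, l)  splices the private list l at q's tail
     *                                             (used by kaapi_end_graph() to publish)
     */
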
diff --git a/runtime/src/kaapi_wsprotocol.h b/runtime/src/kaapi_wsprotocol.h
index 1c248c66faf2f19e76c08a9494b16ca5159bd1c3..13b06980651d9a544f354d8d23702f698e1b5c33 100644
--- a/runtime/src/kaapi_wsprotocol.h
+++ b/runtime/src/kaapi_wsprotocol.h
@@ -46,8 +46,6 @@
 #ifndef _KAAPI_WSPROTOCOL_H
 #define _KAAPI_WSPROTOCOL_H 1
 
-#include "kmp.h"
-
 /* FWD
 */
 struct kaapi_request_node_t;
@@ -63,6 +61,17 @@ typedef struct kmp_taskdata kaapi_task_t;
     classical T.H.E algorithm
 */
 typedef struct kaapi_wsqueue_t {
+#if LIBOMP_USE_LINKED_DEQUEUE
+#if !LIBOMP_USE_THE_AGGREGATION
+  kmp_bootstrap_lock_t deque_lock; // Lock for accessing deque
+#endif
+  kaapi_task_t * deque_H;
+  kaapi_task_t * deque_T;
+  kmp_int32 deque_size;            // Number of tasks in deque
+  int numa_node;                   // Preferred numa node or -1 if unspecified
+
+#else // !LIBOMP_USE_LINKED_DEQUEUE
+
   volatile unsigned int deque_H;   // Head of deque: steal by the thief
 #if !LIBOMP_USE_THE_AGGREGATION
   kmp_bootstrap_lock_t deque_lock; // Lock for accessing deque: always taken by the thieves
@@ -74,6 +83,7 @@ typedef struct kaapi_wsqueue_t {
   int numa_node;                   // Prefered numa node or -1 if unspecified
   kaapi_task_t** deque;            // Deque of tasks encountered by td_thr, dynamically allocated
   unsigned int deque_size;         // Size of deck
+#endif
 #if LIBOMP_USE_THE_AGGREGATION
   struct kaapi_request_node_t* tail;
 #endif
@@ -83,19 +93,40 @@
 
 /*
 */
 static inline int kaapi_wsqueue_empty( kaapi_wsqueue_t* queue )
-{ return queue->deque_H >= queue->deque_T; }
+{
+#if LIBOMP_USE_LINKED_DEQUEUE
+  return queue->deque_H == 0;
+#else
+  return queue->deque_H >= queue->deque_T;
+#endif
+}
+
 
 extern int kaapi_wsqueue_init(kaapi_wsqueue_t* queue, size_t size, int numa_node );
 extern int kaapi_wsqueue_fini(kaapi_wsqueue_t* queue);
 extern int kaapi_wsqueue_realloc(kaapi_wsqueue_t* queue );
 
 /* push
    If remote = 0, must be called by the owner of the queue only (one thread)
+   If remote = 1, the task is pushed such that the next steal request will get it.
   Return 0 in case of success else return ENOMEM if it cannot push
 */
 extern int kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task );
 
+/* same as kaapi_wsqueue_push_task - assumes no concurrency
+*/
+extern int __kaapi_wsqueue_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task );
+
+/* push list of tasks
+   Return 0 in case of success else return ENOMEM if it cannot push
+*/
+#if LIBOMP_USE_LINKED_DEQUEUE
+extern int kaapi_wsqueue_push_tasklist(kaapi_wsqueue_t* dest, kaapi_wsqueue_t* src );
+extern int __kaapi_wsqueue_push_tasklist(kaapi_wsqueue_t* queue, kaapi_wsqueue_t* list );
+#endif
+
 /* push
-   Serialize owner before pushing task.
+   Serialize owner(s) before pushing task.
+   If remote = 1, the task is pushed such that the next steal request will get it.
   Return 0 in case of success else return ENOMEM if it cannot push
 */
 extern int kaapi_wsqueue_locked_push_task(kaapi_wsqueue_t* queue, int remote, kaapi_task_t* task );
@@ -106,8 +137,13 @@ extern int kaapi_wsqueue_locked_push_task(kaapi_wsqueue_t* queue, int remote, ka
 */
 extern kaapi_task_t* kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue );
 
+/* same as kaapi_wsqueue_pop_task - assumes no concurrency
+*/
+extern kaapi_task_t* __kaapi_wsqueue_pop_task( kaapi_wsqueue_t* queue );
+
+
 /* pop
-   May be called by any threads that consider to be owner of the queue.
+   May be called by owner threads of the queue.
   Return 0 in case of failure
 */
 extern kaapi_task_t* kaapi_wsqueue_locked_pop_task( kaapi_wsqueue_t* queue );
@@ -118,5 +154,21 @@ extern kaapi_task_t* kaapi_wsqueue_locked_pop_task( kaapi_wsqueue_t* queue );
 */
 extern kaapi_task_t* kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue );
 
+/* same as kaapi_wsqueue_steal_task - assumes no concurrency
+*/
+extern kaapi_task_t* __kaapi_wsqueue_steal_task( kaapi_wsqueue_t* queue );
+
+
+/* Similar to std::list::splice
+   Move the tasks in [itBegin, itEnd] to just after the position itInsert.
+   Currently not accessible through the aggregation protocol; it can only be
+   used on a non-shared queue.
+*/
+extern void kaapi_wsqueue_splice(
+    kaapi_wsqueue_t* ready_queue,
+    kaapi_task_t* itInsert,
+    kaapi_task_t* itBegin,
+    kaapi_task_t* itEnd );
+
 #endif
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 1b00c0bfd20d3712371f148cb3c85480f1316d94..259bfe68d616f0bd27be99e54c242441db8b9f8f 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -136,7 +136,7 @@ class kmp_stats_list;
 #include "ompt-internal.h"
 #endif
 
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -2397,6 +2397,17 @@ struct kmp_taskdata { /* aligned during dynamic allocation */
   kmp_task_team_t *td_task_team;
   kmp_int32 td_size_alloc; // The size of task structure, including shareds etc.
 #endif
+#if LIBOMP_USE_LINKED_DEQUEUE
+  struct kmp_taskdata* next;
+  struct kmp_taskdata* prev;
+#endif
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  // Only useful during the construction of the graph
+  // Invariant: the range contains only predecessors of this task (TODO: check)
+  struct kmp_taskdata* groupBegin; // first element of the range
+  struct kmp_taskdata* groupEnd;   // last element of the range
+#endif
+
 #if OMPT_SUPPORT
   ompt_task_info_t ompt_task_info;
 #endif
@@ -2407,9 +2418,9 @@ KMP_BUILD_ASSERT(sizeof(kmp_taskdata_t) % sizeof(void *) == 0);
 
 // Data for a hierarchy entity
 typedef struct kmp_base_queue_data {
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   kaapi_wsqueue_t td_wsdeque; // From Kaapi workqueue
-#else
+#else /* original libOMP queue */
   kmp_bootstrap_lock_t td_deque_lock; // Lock for accessing deque
   kmp_taskdata_t ** td_deque;         // Deque of tasks encountered by td_thr, dynamically allocated
   kmp_int32 td_deque_size;            // Size of deck
@@ -2644,6 +2655,11 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
   kmp_uint32 th_reap_state; // Non-zero indicates thread is not
                             // tasking, thus safe to reap
 
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  // private list of tasks used between begin_graph and end_graph to store pushed, ready tasks
+  // the queue is made visible at end_graph and is not stolen until the next call to this function
+  kaapi_wsqueue_t* th_tasklist;
+#endif
 #if LIBOMP_USE_VARDEP && OMP_40_ENABLED
   kmp_uint32 th_edps_size[2];         // cummulative count in th_edeps
   kmp_extra_depinfo_th_t th_edeps[2]; // [alias/noalias]
@@ -3870,6 +3886,8 @@ KMP_EXPORT void __kmpc_omp_set_task_affinity(kmp_uint32 kind, kmp_uint64 affinit
 KMP_EXPORT void __kmpc_omp_set_task_name(char *name);
 KMP_EXPORT void __kmpc_omp_set_task_attr(char key, long int value);
 
+KMP_EXPORT void* __kmpc_omp_begin_sched_graph(int flag);
+KMP_EXPORT void __kmpc_omp_end_sched_graph(void* handle, int flag );
 KMP_EXPORT void __kmpc_omp_set_task_alloc_size( int rsrc, unsigned long size, void* data );
 KMP_EXPORT void __kmpc_omp_set_task_free_size( int rsrc, unsigned long size, void* data );
 
diff --git a/runtime/src/kmp_config.h.cmake b/runtime/src/kmp_config.h.cmake
index e20e43319ee90e1f73378880138353b120ee5888..c56daa369959970a65b4bca497b4e10827b6122b 100644
--- a/runtime/src/kmp_config.h.cmake
+++ b/runtime/src/kmp_config.h.cmake
@@ -53,7 +53,9 @@
 #cmakedefine01 LIBOMP_USE_AFFINITY
 #cmakedefine01 LIBOMP_USE_AFFINITY
 #cmakedefine01 LIBOMP_USE_EXTSCHED_MEM
+#cmakedefine01 LIBOMP_USE_REORDER4LOCALITY
 #cmakedefine01 LIBOMP_USE_THEQUEUE
+#cmakedefine01 LIBOMP_USE_LINKED_DEQUEUE
 #cmakedefine01 LIBOMP_USE_THE_AGGREGATION
 #cmakedefine01 LIBOMP_USE_CONCURRENT_WRITE
 #cmakedefine01 LIBOMP_USE_DYNHASH
diff --git a/runtime/src/kmp_ftn_entry.h b/runtime/src/kmp_ftn_entry.h
index 3da40d790e60c35e1b602f69093382a019523b5f..47f65c69902b3569219b8c4457d5a2e922a404e2 100644
--- a/runtime/src/kmp_ftn_entry.h
+++ b/runtime/src/kmp_ftn_entry.h
@@ -659,6 +659,26 @@ FTN_SET_DEPEND_INFO(void * addr, int flag)
 
 #if OMP_40_ENABLED
 
+void* FTN_STDCALL
+FTN_BEGIN_SCHEDGRAPH(int flag )
+{
+  #ifdef KMP_STUB
+    return (void*)0; // stub: no graph is recorded
+  #else
+    return __kmpc_omp_begin_sched_graph(flag);
+  #endif
+}
+
+void FTN_STDCALL
+FTN_END_SCHEDGRAPH(void* handle, int flag )
+{
+  #ifdef KMP_STUB
+    // nothing
+  #else
+    __kmpc_omp_end_sched_graph(handle, flag);
+  #endif
+}
+
 #if LIBOMP_USE_VARDEP
 int FTN_STDCALL
 FN_TASK_DECLDEPS( int mode, int count, void** array)
diff --git a/runtime/src/kmp_ftn_os.h b/runtime/src/kmp_ftn_os.h
index e7ed60b51c8eed76ba77ad34768b416651a212b5..0457f3209656e368e64fb242a81ef6c25c2f3503 100644
--- a/runtime/src/kmp_ftn_os.h
+++ b/runtime/src/kmp_ftn_os.h
@@ -92,6 +92,8 @@
 #define FN_TASK_DECLDEPS_ARRAY omp_task_declare_dependencies_array
 #define FN_TASK_DECLDEPS_ARRAY_NOALIAS omp_task_declare_dependencies_array_noalias
 #endif
+#define FTN_BEGIN_SCHEDGRAPH omp_begin_sched_graph
+#define FTN_END_SCHEDGRAPH omp_end_sched_graph
 #define FTN_GET_NUM_TEAMS omp_get_num_teams
 #define FTN_GET_TEAM_NUM omp_get_team_num
 #endif
@@ -225,6 +227,8 @@
 #define FN_TASK_DECLDEPS_ARRAY omp_task_declare_dependencies_array_
 #define FN_TASK_DECLDEPS_ARRAY_NOALIAS omp_task_declare_dependencies_array_noalias_
 #endif
+#define FTN_BEGIN_SCHEDGRAPH omp_begin_sched_graph_
+#define FTN_END_SCHEDGRAPH omp_end_sched_graph_
 #define FTN_GET_NUM_TEAMS omp_get_num_teams_
 #define FTN_GET_TEAM_NUM omp_get_team_num_
 #endif
@@ -359,6 +363,8 @@
 #define FN_TASK_DECLDEPS_ARRAY OMP_TASK_DECLARE_DEPENDENCIES_ARRAY
 #define FN_TASK_DECLDEPS_ARRAY_NOALIAS OMP_TASK_DECLARE_DEPENDENCIES_ARRAY_NOALIAS
 #endif
+#define FTN_BEGIN_SCHEDGRAPH OMP_BEGIN_SCHED_GRAPH
+#define FTN_END_SCHEDGRAPH OMP_END_SCHED_GRAPH
 #define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS
 #define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM
 #endif
@@ -485,7 +491,7 @@
 #define FTN_SET_TASK_FREE_MEM OMP_SET_TASK_FREE_SIZE_
 #define FTN_INFO_ALLOC_SIZE OMP_INFO_ALLOC_MEMORY_
 #define FTN_INFO_FREE_SIZE OMP_INFO_FREE_MEMORY_
-#define FTN_SET_MEMORY_LIMIT OMP_SET_MEMORY_LIMIT
+#define FTN_SET_MEMORY_LIMIT OMP_SET_MEMORY_LIMIT_
 #if OMP_40_ENABLED
 #if LIBOMP_USE_VARDEP
 #define FN_TASK_DECLDEPS OMP_TASK_DECLARE_DEPENDENCIES_
@@ -493,6 +499,8 @@
 #define FN_TASK_DECLDEPS_ARRAY OMP_TASK_DECLARE_DEPENDENCIES_ARRAY_
 #define FN_TASK_DECLDEPS_ARRAY_NOALIAS OMP_TASK_DECLARE_DEPENDENCIES_ARRAY_NOALIAS_
 #endif
+#define FTN_BEGIN_SCHEDGRAPH OMP_BEGIN_SCHED_GRAPH_
+#define FTN_END_SCHEDGRAPH OMP_END_SCHED_GRAPH_
 #define FTN_GET_NUM_TEAMS OMP_GET_NUM_TEAMS_
 #define FTN_GET_TEAM_NUM OMP_GET_TEAM_NUM_
 #endif
diff --git a/runtime/src/kmp_queues.cpp b/runtime/src/kmp_queues.cpp
index 38e766db716d6a49b1e488527f0e058a77d02024..28df51a7f100d347b767c3885290d7cf702217cc 100644
--- a/runtime/src/kmp_queues.cpp
+++ b/runtime/src/kmp_queues.cpp
@@ -15,7 +15,7 @@ int __kmp_cpu2node(int cpu) { return machine_info.cpu2node[cpu]; }
 void __kmp_init_task_deque(kmp_queue_data_t *q, int level, int level_id, int node)
 {
   kmp_base_queue_data_t *queue = &q->qd;
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   kaapi_wsqueue_init( &queue->td_wsdeque, INITIAL_TASK_DEQUE_SIZE, node);
 #else
   __kmp_alloc_task_deque(queue, node);
@@ -36,7 +36,7 @@ void __kmp_init_task_deque(kmp_queue_data_t *q, int level, int level_id, int nod
 void
 __kmp_alloc_task_deque(kmp_base_queue_data_t *queue, int node)
 {
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   kaapi_wsqueue_init( &queue->td_wsdeque, INITIAL_TASK_DEQUE_SIZE, node);
 #else
   /*TODO PV something?
     I need to allocate node queue on nodes...*/
@@ -74,7 +74,7 @@ __kmp_alloc_task_deque(kmp_base_queue_data_t *queue, int node)
 void __kmp_realloc_task_deque(kmp_base_queue_data_t *queue)
 {
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   kaapi_wsqueue_realloc( &queue->td_wsdeque );
 #else
   kmp_int32 size = TASK_DEQUE_SIZE(queue);
@@ -115,7 +115,9 @@ void __kmp_realloc_task_deque(kmp_base_queue_data_t *queue)
 void
 __kmp_free_task_deque( kmp_base_queue_data_t *own_queue )
 {
-#if !LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
+  kaapi_wsqueue_fini( &own_queue->td_wsdeque );
+#else
   __kmp_acquire_bootstrap_lock( &own_queue->td_deque_lock );
   if ( own_queue->td_deque != NULL )
   {
@@ -128,7 +130,5 @@ __kmp_free_task_deque( kmp_base_queue_data_t *own_queue )
     own_queue->td_deque = NULL;
   }
   __kmp_release_bootstrap_lock( &own_queue->td_deque_lock );
-#else
-  kaapi_wsqueue_fini( &own_queue->td_wsdeque );
 #endif
 }
diff --git a/runtime/src/kmp_queues.h b/runtime/src/kmp_queues.h
index 24c014327b8786f0b2e987aaa6faef36778f2d95..7972428295d897ac04a94b859197c612b5a06d83 100644
--- a/runtime/src/kmp_queues.h
+++ b/runtime/src/kmp_queues.h
@@ -9,7 +9,7 @@
 static inline int
 __kmp_queue_empty(kmp_queue_data_t *queue)
 {
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   return kaapi_wsqueue_empty(&queue->qd.td_wsdeque);
 #else
   return TCR_4(queue ->qd. td_deque_ntasks) == 0;
diff --git a/runtime/src/kmp_runtime.cpp b/runtime/src/kmp_runtime.cpp
index 265d5749914313404c59df4d6afe5e4cea5f687f..67318b5b89791043f9e4b88cab3d1dea9c3ba7e7 100644
--- a/runtime/src/kmp_runtime.cpp
+++ b/runtime/src/kmp_runtime.cpp
@@ -4092,6 +4092,9 @@ static void __kmp_initialize_info(kmp_info_t *this_thr, kmp_team_t *team,
   this_thr->th.th_task_key[0] = 0;
   this_thr->th.th_task_key[1] = 0;
 #endif
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  this_thr->th.th_tasklist = 0;
+#endif
 #if LIBOMP_USE_VARDEP && OMP_40_ENABLED
   this_thr->th.th_edps_size[0] = 0;
   this_thr->th.th_edps_size[1] = 0;
diff --git a/runtime/src/kmp_settings.cpp b/runtime/src/kmp_settings.cpp
index aca5b89b06a11225d272f862465f06bc1874bc6d..3e65def03d6070d7cce94cb5c02ae3b76ffd0dfa 100644
--- a/runtime/src/kmp_settings.cpp
+++ b/runtime/src/kmp_settings.cpp
@@ -5380,6 +5380,11 @@ void __kmp_env_print_2() {
 #else
 #  define K_QUEUE ""
 #endif
+#if LIBOMP_USE_LINKED_DEQUEUE
+#  define K_LQUEUE "+linked deque"
+#else
+#  define K_LQUEUE ""
+#endif
 #if LIBOMP_USE_CONCURRENT_WRITE && OMP_40_ENABLED
 #  define K_CW "+Concurrent write"
 #else
@@ -5396,9 +5401,9 @@ void __kmp_env_print_2() {
 #  define K_DH ""
 #endif
 
-#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE || LIBOMP_USE_THE_AGGREGATION \
+#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE || LIBOMP_USE_THE_AGGREGATION \
     || LIBOMP_USE_CONCURRENT_WRITE || LIBOMP_USE_VARDEP || LIBOMP_USE_DYNHASH
-    __kmp_str_buf_print( & buffer, " OMP version enabled: " K_30 K_40 K_45 K_50 ", extended with: " K_QUEUE K_AFFINITY K_AGGREGATION K_CW K_DH K_VD"\n", 0 );
+    __kmp_str_buf_print( & buffer, " OMP version enabled: " K_30 K_40 K_45 K_50 ", extended with: " K_QUEUE K_LQUEUE K_AFFINITY K_AGGREGATION K_CW K_DH K_VD"\n", 0 );
 #endif
 
   __kmp_str_buf_print(&buffer, "%s\n", KMP_I18N_STR(DisplayEnvEnd));
diff --git a/runtime/src/kmp_taskdeps.cpp b/runtime/src/kmp_taskdeps.cpp
index f6f5665ba456030bcd6d118834ec704fc3a40743..59aa5e9c99c48151dc002adfe0f9e2204a55b009 100644
--- a/runtime/src/kmp_taskdeps.cpp
+++ b/runtime/src/kmp_taskdeps.cpp
@@ -24,6 +24,11 @@
 #include "kmp_atomic.h"
"kmp_atomic.h" #endif +#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY +#include "kmp_taskreschedule.h" +#endif + + #if OMP_40_ENABLED // TODO: Improve memory allocation? keep a list of pre-allocated structures? @@ -546,6 +551,14 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, __kmp_track_dependence(gtid, indep, outdep, task); indep->dn.successors = __kmp_add_node(thread, indep->dn.successors, outdep); +#if LIBOMP_USE_REORDER4LOCALITY + kaapi_reorder4locality_addDependency( + thread->th.th_tasklist, + KMP_TASK_TO_TASKDATA(indep->dn.task), + KMP_TASK_TO_TASKDATA(task), + dep + ); +#endif KA_TRACE(40, ("__kmp_process_deps<%d>: T#%d adding dependence from " "%p to %p\n", filter, gtid, KMP_TASK_TO_TASKDATA(indep->dn.task), @@ -598,6 +611,14 @@ __kmp_process_deps(kmp_int32 gtid, kmp_depnode_t *node, kmp_dephash_t *hash, __kmp_track_dependence(gtid, outdep, node, task); outdep->dn.successors = __kmp_add_node(thread, outdep->dn.successors, node); +#if LIBOMP_USE_REORDER4LOCALITY + kaapi_reorder4locality_addDependency( + thread->th.th_tasklist, + KMP_TASK_TO_TASKDATA(outdep->dn.task), + KMP_TASK_TO_TASKDATA(task), + dep + ); +#endif KA_TRACE( 40, ("__kmp_process_deps<%d>: T#%d adding dependence from %p to %p\n", @@ -1293,7 +1314,7 @@ kmp_int32 __kmpc_omp_task_with_deps(ident_t *loc_ref, kmp_int32 gtid, //TG: do not call the c entry point that may check vardep to call kmpc_omp_task_with_deps // generating infinite recursive function call. // return __kmpc_omp_task(loc_ref, gtid, new_task); - return __kmp_omp_task(gtid, new_task, true); + return __kmp_omp_task(gtid, new_task, false); // TG: it was a true ??? } /*! diff --git a/runtime/src/kmp_tasking.cpp b/runtime/src/kmp_tasking.cpp index 498c408cc4ba65ccd44b836e46bfe2e7ea602a11..e40af23e6bddafe5b6747218fe203037811d03f6 100644 --- a/runtime/src/kmp_tasking.cpp +++ b/runtime/src/kmp_tasking.cpp @@ -27,6 +27,13 @@ #include "tsan_annotations.h" +#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY +#warning "HERE" +#include "kmp_taskreschedule.h" +#else +#endif + + #if LIBOMP_USE_AFFINITY #include <numaif.h> #ifndef _GNU_SOURCE @@ -34,7 +41,6 @@ #endif #include <sched.h> /* sched_getcpu */ - /* */ static int kaapi_numa_getpage_id(const void* addr) @@ -326,6 +332,14 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task ) KMP_DEBUG_ASSERT(TCR_4(task_team->tt.tt_found_tasks) == TRUE); KMP_DEBUG_ASSERT(TCR_PTR(task_team->tt.tt_threads_data) != NULL); +#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY + if (thread->th.th_tasklist) + { + kaapi_push_task( thread, taskdata ); + return TASK_SUCCESSFULLY_PUSHED; + } +#endif + // Find tasking deque specific to encountering thread thread_data = & task_team -> tt.tt_threads_data[ tid ]; @@ -335,7 +349,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task ) int selected_nodeid = -1; #endif -#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_EXTSCHED_MEM||LIBOMP_USE_AFFINITY +#if LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE || LIBOMP_USE_AFFINITY || LIBOMP_USE_EXTSCHED_MEM int isremote = 1; #endif @@ -359,7 +373,10 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task ) selected_queue = &task_team->tt.tt_task_queues_mem[KMP_LEVEL_MACHINE][0].qd; else #endif - selected_queue = &task_team->tt.tt_task_queues[KMP_LEVEL_THREAD][tid].qd; + if ( kaapi_wsqueue_empty(&task_team->tt.tt_task_private_queues[KMP_LEVEL_THREAD][tid].qd->td_wsdeque)) + selected_queue = &task_team->tt.tt_task_private_queues[KMP_LEVEL_THREAD][tid].qd; + else + selected_queue = 
   isremote = 0;
   KA_TRACE(10, ( "__kmp_push_task: pushing to own private queue, no affinity\n" ) );
   } break;
@@ -519,7 +536,7 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
   KMP_DEBUG_ASSERT(selected_queue);
 
   // No lock needed since only owner can allocate
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
   kaapi_wsqueue_t* pqueue = &(selected_queue->td_wsdeque);
   kmp_int32 err = 0;
 #if LIBOMP_USE_AFFINITY||LIBOMP_USE_EXTSCHED_MEM
@@ -530,19 +547,28 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
         taskdata
     );
   else
+#if LIBOMP_USE_AFFINITY
+  if (selected_queue == &task_team->tt.tt_task_private_queues[KMP_LEVEL_THREAD][tid].qd)
+    err = kaapi_wsqueue_push_task(
+        pqueue,
+        isremote,
+        taskdata
+    );
+  else
+#endif
 #endif
   /* no affinity: 1 queue own by the current thread only, no concurrent "owners" */
-  err = kaapi_wsqueue_push_task(
-      pqueue,
-      isremote,
-      taskdata
-  );
+    err = kaapi_wsqueue_push_task(
+        pqueue,
+        isremote,
+        taskdata
+    );
   if (err)
   {
     //printf("Task not pushed, remote:%i !\n", isremote);
     return TASK_NOT_PUSHED;
   }
   return TASK_SUCCESSFULLY_PUSHED;
-#else /* LIBOMP_USE_THEQUEUE */
+#else /* LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE */
   if (selected_queue -> td_deque == NULL )
   {
     __kmp_alloc_task_deque( selected_queue, -1 );
@@ -1366,6 +1392,10 @@ kmp_task_t* __kmp_task_alloc( ident_t *loc_ref, kmp_int32 gtid, kmp_tasking_flag
   } else {
     task->shareds = NULL;
   }
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  taskdata->groupBegin = 0;
+  taskdata->groupEnd = 0;
+#endif
 #if OMP_40_ENABLED
   taskdata->td_flags.depsfill= 0;
   if (ndeps && (flags->depsinalloc))
@@ -2225,6 +2255,29 @@ __kmpc_omp_set_task_attr(char key, long int value)
 #endif
 }
 
+
+//-------------------------------------------------------------------------------------
+// __kmpc_omp_begin_sched_graph: begin recording the task graph for locality scheduling
+void* __kmpc_omp_begin_sched_graph(int flag)
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  kmp_info_t* thread = __kmp_threads[ __kmp_entry_gtid() ];
+  kaapi_wsqueue_t* queue = kaapi_begin_graph(thread, (uint32_t)flag);
+  return queue;
+#else
+  return 0;
+#endif
+}
+
+void __kmpc_omp_end_sched_graph(void* handle, int flag )
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  kmp_info_t* thread = __kmp_threads[ __kmp_entry_gtid() ];
+  kaapi_end_graph(thread, (uint32_t)flag);
+#endif
+}
+
+
 //-------------------------------------------------------------------------------------
 /*!
   @ingroup TASKING
@@ -2499,6 +2552,8 @@ static kmp_queue_data_t *__kmp_select_queue(
 {
   if (isapop) /* means pop operation */
   {
+    *islocal = 1;
+    CHECK_RETURN(&current_team->tt.tt_task_private_queues[KMP_LEVEL_THREAD][tid]);
     // cpu
     for (int i=0; i<2; ++i)
     {
 #if LIBOMP_USE_EXTSCHED_MEM
@@ -2614,7 +2669,7 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 cpu, kmp_int32 node, kmp_in
     return NULL;
   }
 
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
   KMP_DEBUG_ASSERT( gtid == __kmp_get_gtid() );
 #if LIBOMP_USE_EXTSCHED_MEM
   //printf("%i::Islocal:%i\n", gtid, islocal);
@@ -2645,8 +2700,15 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 cpu, kmp_int32 node, kmp_in
   else
 #endif
   if (islocal ==1) /* == own queue or locked pop on constraints */
-    taskdata = kaapi_wsqueue_pop_task( &(selected_struct_queue->qd.td_wsdeque) );
-  else
+  {
+#if LIBOMP_USE_AFFINITY
+    kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+    if (selected_struct_queue == &task_team->tt.tt_task_private_queues[KMP_LEVEL_THREAD][tid])
+      taskdata = __kaapi_wsqueue_pop_task( &(selected_struct_queue->qd.td_wsdeque) );
+    else
+#endif
+      taskdata = kaapi_wsqueue_pop_task( &(selected_struct_queue->qd.td_wsdeque) );
+  } else
     taskdata = kaapi_wsqueue_locked_pop_task( &(selected_struct_queue->qd.td_wsdeque) );
   if (taskdata ==0) return 0;
 #if OMPT_SUPPORT
@@ -2755,7 +2817,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 cpu, kmp_int32 node,
 
   KMP_DEBUG_ASSERT(__kmp_tasking_mode != tskm_immediate_exec);
 
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
   if (TCR_PTR(victim->th.th_task_team) != task_team)
   { // GEH: why would this happen?
     return 0;
@@ -2782,7 +2844,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 cpu, kmp_int32 node,
   if (__kmp_queue_empty(victim_struct_queue))
     return NULL;
 
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                 "task_team=%p queue=%p\n",
                 gtid, __kmp_gtid_from_thread( victim ), task_team,
@@ -2845,7 +2907,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 cpu, kmp_int32 node,
   KMP_COUNT_BLOCK(TASK_stolen);
   return task;
 
-#else //LIBOMP_USE_THEQUEUE
+#else //LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
 
   KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: "
                 "task_team=%p ntasks=%d "
@@ -3035,7 +3097,7 @@ static inline int __kmp_execute_tasks_template(
     state = IDLE_STATE;
   }
 #endif
-#if !LIBOMP_USE_AFFINITY && !LIBOMP_USE_THEQUEUE
+#if !LIBOMP_USE_AFFINITY && !LIBOMP_USE_THEQUEUE && !LIBOMP_USE_LINKED_DEQUEUE
   if (use_own_tasks) // check on own queue first
 #else
   /* loop to increase local execution */
@@ -3255,7 +3317,7 @@ static inline int __kmp_execute_tasks_template(
       use_own_tasks = 1;
     else
 #endif
-#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
     //TG: avoid to return if flag condition is not check - WARN: threads are higly active
     if (flag != NULL && flag->done_check())
 #else
@@ -3273,7 +3335,7 @@ static inline int __kmp_execute_tasks_template(
       return FALSE;
   }
 
-#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_AFFINITY || LIBOMP_USE_THEQUEUE || LIBOMP_USE_LINKED_DEQUEUE
   if (final_spin)
     return FALSE;
 #else
@@ -4057,7 +4119,7 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
   kmp_base_queue_data_t *victim_queue = &task_team->tt.tt_task_queues[KMP_LEVEL_THREAD][cpu].qd;
   KMP_DEBUG_ASSERT(victim_queue);
 
-#if !LIBOMP_USE_THEQUEUE
+#if !LIBOMP_USE_THEQUEUE && !LIBOMP_USE_LINKED_DEQUEUE
   if (victim_queue->td_deque == NULL )
   {
     // There's no queue in this thread, go find another one
     // We're guaranteed that at least one thread has a queue
@@ -4068,7 +4130,7 @@ static bool __kmp_give_task(kmp_info_t *thread, kmp_int32 tid, kmp_task_t *task,
   }
 #endif
 
-#if LIBOMP_USE_THEQUEUE
+#if LIBOMP_USE_THEQUEUE||LIBOMP_USE_LINKED_DEQUEUE
   kmp_int32 gtid = thread->th.th_info.ds.ds_gtid;
   if (__kmp_push_task(gtid, task) == TASK_NOT_PUSHED)
     result = false;
diff --git a/runtime/src/kmp_taskreschedule.cpp b/runtime/src/kmp_taskreschedule.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..a7305fa3411c73edd9271e8a07f0ba03e0963e26
--- /dev/null
+++ b/runtime/src/kmp_taskreschedule.cpp
@@ -0,0 +1,148 @@
+//
+// kmp_taskreschedule.cpp
+// libomp
+//
+// Copyright © 2018 Jérôme Richard - Gautier Thierry. All rights reserved.
+//
+#include "kmp_config.h"
+#include "kmp.h"
+#include "kmp_i18n.h"
+
+
+#include "kmp_taskreschedule.h"
+
+/*
+*/
+kaapi_wsqueue_t* kaapi_begin_graph(kmp_info_t *thread, uint32_t flag)
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  kmp_task_team_t* task_team = thread->th.th_task_team;
+  if (task_team ==0) return 0;
+  kaapi_wsqueue_t* ws;
+#if USE_FAST_MEMORY
+  ws = (kaapi_wsqueue_t *)__kmp_fast_allocate(thread, sizeof(kaapi_wsqueue_t));
+#else
+  ws = (kaapi_wsqueue_t *)__kmp_thread_malloc(thread, sizeof(kaapi_wsqueue_t));
+#endif
+  kaapi_wsqueue_init( ws, INITIAL_TASK_DEQUE_SIZE, -1 );
+  thread->th.th_tasklist = ws;
+  return ws;
+#else
+  return 0;
+#endif
+}
+
+
+/*
+*/
+void kaapi_push_task( kmp_info_t* thread, kaapi_task_t* task )
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  kaapi_wsqueue_t* queue = thread->th.th_tasklist;
+  task->next = 0;
+  task->prev = queue->deque_T;
+  if (task->prev ==0)
+    queue->deque_H = task;
+  else
+    queue->deque_T->next = task;
+  queue->deque_T = task;
+  ++queue->deque_size;
+
+  task->groupBegin = task;
+  task->groupEnd = task;
+#endif
+}
+
+
+/*
+*/
+void kaapi_end_graph(kmp_info_t *thread, uint32_t flag)
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  if (thread->th.th_tasklist ==0) return;
+
+  /* select the queue */
+  kmp_task_team_t* task_team = thread->th.th_task_team;
+  kmp_int32 tid = thread->th.th_info.ds.ds_tid;
+  kmp_base_queue_data_t *selected_queue = NULL;
+
+#if LIBOMP_USE_AFFINITY
+  int cpu = sched_getcpu();
+  int selected_nodeid = __kmp_cpu2node(cpu);
+
+  selected_queue = &task_team->tt.tt_task_private_queues[KMP_LEVEL_NUMA][selected_nodeid].qd;
+#else
+  selected_queue = &task_team->tt.tt_task_queues[KMP_LEVEL_THREAD][tid].qd;
+#endif
+  kaapi_wsqueue_t* queue = &(selected_queue->td_wsdeque);
+  kaapi_wsqueue_t* tasklist = thread->th.th_tasklist;
+  thread->th.th_tasklist = 0;
+  kaapi_wsqueue_push_tasklist( queue, tasklist );
+  KMP_ASSERT( kaapi_wsqueue_empty(tasklist) );
+#if USE_FAST_MEMORY
+  __kmp_fast_free(thread, tasklist);
+#else /* ! USE_FAST_MEMORY */
+  __kmp_thread_free(thread, tasklist);
+#endif
+#endif
+}
+
+
+/*
+*/
+void kaapi_reorder4locality_addDependency(
+    kaapi_wsqueue_t* ready_queue,
+    kaapi_task_t* predTask,
+    kaapi_task_t* currtask,
+    const kmp_depend_info_t* depinfo)
+{
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+  // First dependency
+  if(currtask->groupBegin == 0)
+  {
+    currtask->groupBegin = predTask->groupBegin;
+    currtask->groupEnd = predTask->groupEnd;
+  }
+  else
+  {
+    // The predecessor groups may not be contiguous
+    // Assumption: submitted tasks are appended at the end of the ready list and dependencies are sorted
+    // Assumption: groups do not alias (to check: not always true but seems to be ok)
+    if(currtask->groupEnd->next != predTask->groupBegin)
+    {
+      // FAIL: group aliasing (with 3 levels) or replicated dependency
+
+      // Hack to avoid handling the same task again when handling multiple dependencies
+      if(currtask->groupBegin != predTask->groupBegin)
+      {
+        // Partially avoid reordering predTask's antecedents when they are shared with those of currtask
+        // TODO: support the general case
+        if(currtask->groupEnd != predTask->groupEnd)
+        {
+          kaapi_task_t* itInsert = currtask->groupEnd;
+          kaapi_task_t* itBegin = predTask->groupBegin;
+          kaapi_task_t* itEnd = predTask->groupEnd;
+          bool reorder = true;
+
+          // In some cases the algorithm fails to keep data structures such as the group markers coherent
+          // Iterating over the sublist is sufficient to check that nothing went wrong
+          for(kaapi_task_t* it=itBegin ; it!=itEnd ; it=it->next)
+          {
+            if(it == NULL || it == itInsert)
+            {
+              reorder = false;
+              break;
+            }
+          }
+
+          // Improve locality regarding submission order by moving the predecessor group near the last one
+          if(reorder)
+            kaapi_wsqueue_splice( ready_queue, itInsert, itBegin, itEnd);
+        }
+      }
+    }
+
+    currtask->groupEnd = predTask->groupEnd;
+  }
+#endif
+}
+
diff --git a/runtime/src/kmp_taskreschedule.h b/runtime/src/kmp_taskreschedule.h
new file mode 100644
index 0000000000000000000000000000000000000000..8bf117ad3ffdc41ad18bc5a457bd4fd74885b7f0
--- /dev/null
+++ b/runtime/src/kmp_taskreschedule.h
@@ -0,0 +1,41 @@
+//
+// kmp_taskreschedule.h
+// libomp
+//
+// Copyright © 2018 Jérôme Richard - Gautier Thierry. All rights reserved.
+//
+
+#ifndef kmp_taskreschedule_h
+#define kmp_taskreschedule_h
+
+#include "kaapi_wsprotocol.h"
+
+/* Mark the start of graph construction for locality scheduling.
+   Flag can be used to specify a scheduler heuristic.
+   Each task generated by the current thread until the call to kaapi_end_graph() is
+   stored in the returned queue when ready, or kept pending while it depends on previous tasks.
+   The returned queue is the queue of ready tasks submitted to the workers in kaapi_end_graph().
+*/
+KMP_EXPORT kaapi_wsqueue_t* kaapi_begin_graph(kmp_info_t *thread, uint32_t flag);
+
+#if OMP_40_ENABLED && LIBOMP_USE_REORDER4LOCALITY
+/*
+*/
+KMP_EXPORT void kaapi_push_task( kmp_info_t* thread, kaapi_task_t* task );
+
+/* Add a dependency between 2 tasks.
+   predTask gains task as a successor through the dependency depinfo.
+   ready_queue is the queue to reorder.
+*/
+KMP_EXPORT void kaapi_reorder4locality_addDependency(
+    kaapi_wsqueue_t* ready_queue,
+    kaapi_task_t* predTask,
+    kaapi_task_t* task,
+    const kmp_depend_info_t* depinfo);
+#endif
+
+/* Submit the queue of ready tasks to the workers.
+*/
+KMP_EXPORT void kaapi_end_graph(kmp_info_t *thread, uint32_t flag);
+
+#endif /* kmp_taskreschedule_h */
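
How kaapi_reorder4locality_addDependency() reorders the recorded list, as a worked example (the task names are illustrative). Ready tasks are appended at the tail of the thread-private list by kaapi_push_task(); a pending task tracks the [groupBegin, groupEnd] range of its predecessors and splices a non-contiguous predecessor group next to the current one:

    /* Submission order: A, B, D are ready; C is pending with
     * depend(in: ...) on A and then on D.
     *
     * After pushing A, B, D:        H -> A -> B -> D <- T
     *   A.group = [A,A], B.group = [B,B], D.group = [D,D]
     *
     * Dependency A -> C (first dependency of C): C.group = [A,A]
     * Dependency D -> C: C.groupEnd->next is B, not D.groupBegin, so
     * [D,D] is spliced right after C.groupEnd == A:
     *
     *                               H -> A -> D -> B <- T
     *   C.group = [A,D]
     *
     * The predecessors of C are now contiguous; since the owner pops from
     * the head, it meets them back to back, which is the locality gain.
     */
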