diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 8744af05f200271638f4615baf95ffbc04937e8f..76ae3398aef14a1b6cdb9323ac218d85dd6502ff 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -2363,6 +2363,8 @@ typedef struct kmp_base_task_team {
 #if 1 || LIBOMP_USE_AFFINITY
     // Mask of NUMA nodes active for this team
     kmp_affin_mask_t *tt_nodes_mask;
+    kmp_int32 tt_nodes_count;
+    kmp_queue_data_t **tt_nodes_tasks_queues; /* of size tt_nodes_count */
 #endif
 
     KMP_ALIGN_CACHE
diff --git a/runtime/src/kmp_hws.h b/runtime/src/kmp_hws.h
index dc495d2e83b464adbde1c985c32fabb99539f84e..73793b819ef87a859e1a6895351784adcd43c3f4 100644
--- a/runtime/src/kmp_hws.h
+++ b/runtime/src/kmp_hws.h
@@ -20,7 +20,9 @@ class MachineInfo {
   ~MachineInfo();
   kmp_queue_data_t *core(int cpu) { return &places[KMP_LEVEL_CORE][cpu]; }
+  kmp_int32 cores() { return places_count[KMP_LEVEL_CORE]; }
   kmp_queue_data_t *node(int node) { return &places[KMP_LEVEL_NUMA][node]; }
+  kmp_int32 nodes() const { return places_count[KMP_LEVEL_NUMA]; }
   kmp_queue_data_t *node_from_cpu(int cpu) { return &places[KMP_LEVEL_NUMA][numa_node_of_cpu(cpu)]; }
   kmp_queue_data_t *machine() { return &places[KMP_LEVEL_MACHINE][0]; }
diff --git a/runtime/src/kmp_queues.cpp b/runtime/src/kmp_queues.cpp
index bedc5861b8d8ace0ffb5906ce22e76d330a57978..2fd37ef6590ff4347f1520e9c27daa9713c02597 100644
--- a/runtime/src/kmp_queues.cpp
+++ b/runtime/src/kmp_queues.cpp
@@ -10,7 +10,9 @@
 kmp_queue_data_t *__kmp_queue_of_core(int core) { return machine_places.core(core); };
+kmp_int32 __kmp_machine_cores() { return machine_places.cores(); };
 kmp_queue_data_t *__kmp_queue_of_node(int node) { return machine_places.node(node); };
+kmp_int32 __kmp_machine_nodes() { return machine_places.nodes(); };
 kmp_queue_data_t *__kmp_node_queue_of_core(int core) { return machine_places.node_from_cpu(core); };
 kmp_queue_data_t *__kmp_machine_queue() { return machine_places.machine(); };
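Note on the accessors above: cores()/nodes() report how many per-core and per-NUMA-node place queues MachineInfo holds, and they are exported as __kmp_machine_cores()/__kmp_machine_nodes() so callers can pair a count with the matching indexed lookup. A minimal sketch of that count-then-index pattern (the helper name and the visit callback are illustrative, not part of the patch; assumes the runtime's internal headers):

    /* Walk every per-NUMA-node queue via the accessors added above. */
    static void visit_all_node_queues(void (*visit)(kmp_queue_data_t *q)) {
        kmp_int32 n = __kmp_machine_nodes();   /* number of NUMA-node places */
        for (kmp_int32 i = 0; i < n; ++i)
            visit(__kmp_queue_of_node(i));     /* place queue of node i */
    }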
diff --git a/runtime/src/kmp_tasking.c b/runtime/src/kmp_tasking.c
index 69d9396b6f90962376e0ae4dfcdee4c13fbf4d0a..70b67931d094b9e633d68d293d450e902ebae97d 100644
--- a/runtime/src/kmp_tasking.c
+++ b/runtime/src/kmp_tasking.c
@@ -341,12 +341,6 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
             /*TODO PV this is a system call, we should improve this!*/
             selected_nodeid = kaapi_numa_getpage_id( (void*)taskdata->td_aff_tag );
             KA_TRACE(5, ( "__kmp_push_task: selected node based on mem addr: %i\n", selected_nodeid ) );
-            /*Intentional no break, we need to get the queue!*/
-            /*affinity "node" => numa id. if the numa node is not in the team, push to the first node*/
-        case 2:
-            if (taskdata->td_aff_kind == 2)
-                selected_nodeid = (int)taskdata->td_aff_tag;
-            KA_TRACE(5, ( "__kmp_push_task: selected node based on number: %i\n", selected_nodeid ) );
             if (KMP_CPU_ISSET(selected_nodeid, task_team->tt.tt_nodes_mask)) {
                 KA_TRACE(5, ( "__kmp_push_task: node is in team!\n" ) );
                 selected_queue = &(__kmp_queue_of_node(selected_nodeid)->qd);
@@ -356,6 +350,14 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
                 selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
             }
             break;
+            /*affinity "node" => numa id. if the numa node is not in the team, wrap (modulo) onto one of the team's nodes*/
+        case 2:
+            selected_nodeid = (int)taskdata->td_aff_tag;
+            KA_TRACE(5, ( "__kmp_push_task: selected node based on number: %i\n", selected_nodeid ) );
+            if (selected_nodeid >= task_team->tt.tt_nodes_count)
+                selected_nodeid %= (int)task_team->tt.tt_nodes_count;
+            selected_queue = &task_team->tt.tt_nodes_tasks_queues[selected_nodeid]->qd;
+            break;
             /*affinity "core" => tid or cpu id on the machine, todo -> clarify this*/
         case 3:
             selected_coreid = (int)taskdata->td_aff_tag;
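The new case 2 no longer relies on falling through from the memory-affinity case: a task tagged with an explicit node number is routed through the team's dense tt_nodes_tasks_queues array, with out-of-range ids wrapped by modulo so every tag lands on a queue the team actually owns. The mapping in isolation (standalone sketch; the function name is illustrative):

    #include <assert.h>

    /* Index selection used by case 2 above: tags beyond the team's node
     * count wrap around instead of being rejected. Illustrative only. */
    static int node_queue_index(int tag, int nodes_count) {
        assert(nodes_count > 0 && tag >= 0);
        return (tag < nodes_count) ? tag : tag % nodes_count;
    }

So on a 4-node team a tag of 5 maps to queue 1, while tags 0..3 map to themselves.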
@@ -1790,85 +1792,54 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task
     thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
 
     /*TODO PV hws remove queue! (in this case, CORE queue is probably right)*/
-    kmp_base_queue_data_t *selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
-    /*We may not be initialized yet!*/
-    if (!selected_queue)
-        return NULL;
+    kmp_base_queue_data_t *queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
 
 #if LIBOMP_USE_THEQUEUE
     KMP_DEBUG_ASSERT( gtid == __kmp_get_gtid() );
     taskdata = 0;
-    if (!kaapi_wsqueue_empty(&(selected_queue->td_wsdeque)))
+    if (!kaapi_wsqueue_empty(&queue->td_wsdeque))
     {
-        taskdata = kaapi_wsqueue_pop_task( &(selected_queue->td_wsdeque) );
+        /* KMP_LEVEL_CORE queue is shared among all threads (that can push/pop): use locked pop */
+        taskdata = kaapi_wsqueue_locked_pop_task( &queue->td_wsdeque );
         if (taskdata) goto igotatask;
     }
-    { /* scope */
-#if 0 && LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
-        int local_node = kaapi_cpu2numa[cpu];
-
-#if 0
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue))
-        {
-            taskdata = kaapi_wsqueue_locked_pop_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue
-            );
-            if (taskdata) goto igotatask;
-        }
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue))
+#if LIBOMP_USE_AFFINITY
+    queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
+    if (!kaapi_wsqueue_empty(&queue->td_wsdeque))
     {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue
-        );
+        taskdata = kaapi_wsqueue_locked_pop_task(&queue->td_wsdeque);
         if (taskdata) goto igotatask;
     }
 #endif
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue))
-    {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue
-        );
-        if (taskdata) goto igotatask;
-    }
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue))
-    {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue
-        );
-        if (taskdata) goto igotatask;
-    }
-#endif
-    } /* scope */
-
 igotatask:
     if (taskdata ==0) return 0;
     task = KMP_TASKDATA_TO_TASK( taskdata );
     return task;
 #else
     KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
-                  gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                  selected_queue->td_deque_tail) );
+                  gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                  queue->td_deque_tail) );
 
-    if (TCR_4(selected_queue -> td_deque_ntasks) == 0) {
+    if (TCR_4(queue -> td_deque_ntasks) == 0) {
         KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                      gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                      selected_queue->td_deque_tail) );
+                      gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                      queue->td_deque_tail) );
         return NULL;
     }
 
-    __kmp_acquire_bootstrap_lock( & selected_queue -> td_deque_lock );
+    __kmp_acquire_bootstrap_lock( & queue -> td_deque_lock );
 
-    if (TCR_4(selected_queue -> td_deque_ntasks) == 0) {
-        __kmp_release_bootstrap_lock( & selected_queue -> td_deque_lock );
+    if (TCR_4(queue -> td_deque_ntasks) == 0) {
+        __kmp_release_bootstrap_lock( & queue -> td_deque_lock );
         KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                      gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                      selected_queue->td_deque_tail) );
+                      gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                      queue->td_deque_tail) );
         return NULL;
     }
 
-    kmp_uint32 tail = ( selected_queue -> td_deque_tail - 1 ) & TASK_DEQUE_MASK(selected_queue);  // Wrap index.
-    taskdata = selected_queue -> td_deque[ tail ];
+    kmp_uint32 tail = ( queue -> td_deque_tail - 1 ) & TASK_DEQUE_MASK(queue);  // Wrap index.
+    taskdata = queue -> td_deque[ tail ];
 
     if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
         // we need to check if the candidate obeys task scheduling constraint:
@@ -1882,22 +1853,22 @@
         }
         if ( parent != current ) {
             // If the tail task is not a child, then no other child can appear in the deque.
-            __kmp_release_bootstrap_lock( & selected_queue -> td_deque_lock );
+            __kmp_release_bootstrap_lock( & queue -> td_deque_lock );
             KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                          gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                          selected_queue->td_deque_tail) );
+                          gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                          queue->td_deque_tail) );
             return NULL;
         }
     }
 
-    selected_queue -> td_deque_tail = tail;
-    TCW_4(selected_queue -> td_deque_ntasks, selected_queue -> td_deque_ntasks - 1);
+    queue -> td_deque_tail = tail;
+    TCW_4(queue -> td_deque_ntasks, queue -> td_deque_ntasks - 1);
 
-    __kmp_release_bootstrap_lock( & selected_queue->td_deque_lock );
+    __kmp_release_bootstrap_lock( & queue->td_deque_lock );
 
     KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
-                  gtid, taskdata, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                  selected_queue->td_deque_tail) );
+                  gtid, taskdata, queue->td_deque_ntasks, queue->td_deque_head,
+                  queue->td_deque_tail) );
 
     task = KMP_TASKDATA_TO_TASK( taskdata );
     return task;
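In the LIBOMP_USE_THEQUEUE path above, __kmp_remove_my_task now uses kaapi_wsqueue_locked_pop_task because the KMP_LEVEL_CORE queue is a shared place queue rather than a thread-private deque, and under LIBOMP_USE_AFFINITY it retries on the NUMA-level queue before giving up. The resulting pop order, condensed (illustrative wrapper; the types and kaapi_* calls are the ones the patch uses):

    /* Condensed pop order of __kmp_remove_my_task after this change. */
    static kmp_taskdata_t *pop_local(kmp_base_thread_data_t *td) {
        kmp_base_queue_data_t *q = &td->td_tasks_queues[KMP_LEVEL_CORE]->qd;
        kmp_taskdata_t *t = NULL;
        if (!kaapi_wsqueue_empty(&q->td_wsdeque))
            t = kaapi_wsqueue_locked_pop_task(&q->td_wsdeque); /* shared queue: locked */
    #if LIBOMP_USE_AFFINITY
        if (!t) { /* fall back to the NUMA-node place queue */
            q = &td->td_tasks_queues[KMP_LEVEL_NUMA]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_locked_pop_task(&q->td_wsdeque);
        }
    #endif
        return t;
    }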
@@ -1917,14 +1888,15 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
 {
     kmp_task_t * task;
     kmp_taskdata_t * taskdata;
-    kmp_thread_data_t *victim_td, *threads_data;
+    kmp_thread_data_t *victim_td, *thread_td, *threads_data;
+    kmp_int32 thread_tid = __kmp_tid_from_gtid( gtid );
     kmp_int32 victim_tid;
 
     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
 
     threads_data = task_team -> tt.tt_threads_data;
     KMP_DEBUG_ASSERT( threads_data != NULL );  // Caller should check this condition
-
+    thread_td = & threads_data[ thread_tid ];
     victim_tid = victim->th.th_info.ds.ds_tid;
     victim_td = & threads_data[ victim_tid ];
     /*TODO PV hws selection!*/
@@ -1940,86 +1912,48 @@
     }
 
     taskdata = 0;
-
-/*#if LIBOMP_USE_AFFINITY*/
-/*#if defined(__linux__)*/
-    /*int cpu = sched_getcpu();*/
-/*#else*/
-    /*int cpu = 0;*/
-/*#endif*/
-    /*int local_node = kaapi_cpu2numa[cpu];*/
-    /*kmp_info_t* thread = __kmp_threads[ gtid ];*/
+#if LIBOMP_USE_AFFINITY
     for (int ii=0; ii<8; ++ii)
     {
         for (int i=0; i<8; ++i)
         {
            sched_yield();
-            if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
-                taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
-        }
-    }
-#if 0
-#if 0
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue))
-    {
-        taskdata = kaapi_wsqueue_steal_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue
-        );
-        if (taskdata) goto return_from;
-    }
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue))
+            victim_queue = &thread_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+            if (!kaapi_wsqueue_empty(&victim_queue->td_wsdeque))
+            {
+                taskdata = kaapi_wsqueue_steal_task( &victim_queue->td_wsdeque );
+                if (taskdata) goto return_from;
+            }
+            victim_queue = &thread_td->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
+            if (!kaapi_wsqueue_empty(&victim_queue->td_wsdeque))
+            {
+                taskdata = kaapi_wsqueue_steal_task( &victim_queue->td_wsdeque );
+                if (taskdata) goto return_from;
+            }
         }
-    }
-#if 1
+
+        victim_queue = &victim_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
         if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
         {
             taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
             if (taskdata) goto return_from;
         }
+
+#if 0
+        { /* random steal on NUMA node */
+            int numa_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA];
+            if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue))
+                taskdata = kaapi_wsqueue_steal_task(
+                    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue
+                );
+            if (taskdata) goto return_from;
+        }
 #endif
-    {
-        /* random steal on NUMA node */
-        int numa_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA];
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue))
-            taskdata = kaapi_wsqueue_steal_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue
-            );
-        if (taskdata) goto return_from;
-    }
     }
-#if 0
-    {
-        /* random steal on Core */
-        int core_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE];
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][core_victim]->queue))
-            taskdata = kaapi_wsqueue_steal_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_CORE][core_victim]->queue
-            );
-        if (taskdata) goto return_from;
-    }
-#endif
 #endif
-/*#endif*//*old USE_AFFINITY*/
+    victim_queue = &victim_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+    if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
+        taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
 
 return_from:
     if (taskdata ==0) return 0;
     task = KMP_TASKDATA_TO_TASK( taskdata );
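Note the victim selection above: inside the bounded retry loops the thief first drains the place queues it shares through its own thread_td (core, then NUMA) before touching the victim's core queue, and a final unconditional attempt on the victim's queue runs even when LIBOMP_USE_AFFINITY is off. One round, condensed (illustrative wrapper around the patch's calls):

    /* One steal round as structured above: the thief's own shared places
     * first, then the victim's core queue. Illustrative only. */
    static kmp_taskdata_t *steal_round(kmp_base_thread_data_t *thief_td,
                                       kmp_base_thread_data_t *victim_td) {
        kmp_taskdata_t *t = NULL;
        kmp_base_queue_data_t *q;
        int levels[2] = { KMP_LEVEL_CORE, KMP_LEVEL_NUMA };
        for (int l = 0; l < 2 && !t; ++l) {
            q = &thief_td->td_tasks_queues[levels[l]]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_steal_task(&q->td_wsdeque);
        }
        if (!t) {
            q = &victim_td->td_tasks_queues[KMP_LEVEL_CORE]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_steal_task(&q->td_wsdeque);
        }
        return t;
    }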
@@ -2176,9 +2110,6 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
 # else
     int cpu = 0;
 # endif
-#if LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
-    int local_node = kaapi_cpu2numa[cpu];
-#endif
 #endif
 
 #if OMP_45_ENABLED
@@ -2299,9 +2230,8 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
         if (!use_own_tasks && (!kaapi_wsqueue_empty(& own_queue->td_wsdeque)
 #if LIBOMP_USE_AFFINITY
-                               ||
-                               !kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue) ||
-                               !kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue)
+                               || !kaapi_wsqueue_empty( &threads_data[tid].td.td_tasks_queues[KMP_LEVEL_NUMA]->qd.td_wsdeque )
+                               || !kaapi_wsqueue_empty( &threads_data[tid].td.td_tasks_queues[KMP_LEVEL_CORE]->qd.td_wsdeque )
 #endif
                                )
            )
@@ -2461,6 +2391,7 @@
         }
     }
 
+#if LIBOMP_USE_AFFINITY
     /*Reset team hierarchical queues.*/
     /*TODO PV move this to the team creation!!!*/
     kmp_affin_mask_t *all_cpus;
@@ -2468,7 +2399,6 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
     if (!task_team->tt.tt_nodes_mask)
         KMP_CPU_ALLOC(task_team->tt.tt_nodes_mask);
 
-
     for (i = 0; i < nthreads; i++) {
         kmp_base_thread_data_t *td = &threads_data[i].td;
         int physical_core = KMP_CPU_FIRST(threads_data[i].td.td_thr->th.th_affin_mask);
@@ -2478,9 +2408,18 @@
         td->td_tasks_queues[KMP_LEVEL_NUMA] = __kmp_node_queue_of_core(physical_core);
         td->td_tasks_queues[KMP_LEVEL_MACHINE] = __kmp_machine_queue();
         KA_TRACE( 10, ( "__kmp_enable_tasking: Using physical_core %u for thread %u\n", physical_core, i ) );
-    }
+    }
+    /* TODO TG: move code to team creation - allocate new queues and do not share queues between several teams */
     hwloc_cpuset_to_nodeset(__kmp_hwloc_topology, (hwloc_cpuset_t)all_cpus, (hwloc_nodeset_t)task_team->tt.tt_nodes_mask);
+    task_team->tt.tt_nodes_count = hwloc_bitmap_weight( (hwloc_nodeset_t)task_team->tt.tt_nodes_mask );
+    task_team->tt.tt_nodes_tasks_queues = (kmp_queue_data_t**)__kmp_allocate( task_team->tt.tt_nodes_count * sizeof(kmp_queue_data_t*) );
+    hwloc_nodeset_t tmp = hwloc_bitmap_dup( (hwloc_nodeset_t)task_team->tt.tt_nodes_mask );
+    for (i = 0; i < task_team->tt.tt_nodes_count; i++) {
+        int index = hwloc_bitmap_first( tmp );
+        hwloc_bitmap_clr( tmp, index );
+        task_team->tt.tt_nodes_tasks_queues[i] = __kmp_queue_of_node( index );
+    }
+    hwloc_bitmap_free( tmp ); /* release the scratch copy of the nodeset */
+#endif
 
     KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n", __kmp_gtid_from_thread( this_thr ) ) );
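The nodeset-to-queue-array setup above walks the team's node mask by repeatedly taking and clearing the first set bit of a scratch copy. hwloc can express the same walk without the copy via hwloc_bitmap_next; a standalone sketch (illustrative alternative, not what the patch does):

    #include <hwloc.h>

    /* Collect the ids of all set bits in a nodeset, in ascending order.
     * Returns how many were written; equals hwloc_bitmap_weight() when
     * `max` is large enough. */
    static int collect_node_ids(hwloc_const_nodeset_t nodes, int *out, int max) {
        int count = 0;
        for (int id = hwloc_bitmap_next(nodes, -1);   /* first set bit */
             id != -1 && count < max;
             id = hwloc_bitmap_next(nodes, id))       /* next set bit after id */
            out[count++] = id;
        return count;
    }

Either way, tt_nodes_tasks_queues[i] ends up holding the queue of the i-th active NUMA node, which is exactly the dense indexing the new case 2 in __kmp_push_task relies on.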