diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 8744af05f200271638f4615baf95ffbc04937e8f..76ae3398aef14a1b6cdb9323ac218d85dd6502ff 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -2363,6 +2363,8 @@ typedef struct kmp_base_task_team {
 #if 1 || LIBOMP_USE_AFFINITY
     // Mask of NUMA nodes active for this team
     kmp_affin_mask_t *tt_nodes_mask;
+    kmp_int32 tt_nodes_count;
+    kmp_queue_data_t **tt_nodes_tasks_queues; /* of size tt_nodes_count */
 #endif
 
     KMP_ALIGN_CACHE
diff --git a/runtime/src/kmp_hws.h b/runtime/src/kmp_hws.h
index dc495d2e83b464adbde1c985c32fabb99539f84e..73793b819ef87a859e1a6895351784adcd43c3f4 100644
--- a/runtime/src/kmp_hws.h
+++ b/runtime/src/kmp_hws.h
@@ -20,7 +20,9 @@ class MachineInfo {
   ~MachineInfo();
   kmp_queue_data_t *core(int cpu) { return &places[KMP_LEVEL_CORE][cpu]; }
+  kmp_int32 cores() { return places_count[KMP_LEVEL_CORE]; }
   kmp_queue_data_t *node(int node) { return &places[KMP_LEVEL_NUMA][node]; }
+  kmp_int32 nodes() const { return places_count[KMP_LEVEL_NUMA]; }
   kmp_queue_data_t *node_from_cpu(int cpu) { return &places[KMP_LEVEL_NUMA][numa_node_of_cpu(cpu)]; }
   kmp_queue_data_t *machine() { return &places[KMP_LEVEL_MACHINE][0]; }
diff --git a/runtime/src/kmp_queues.cpp b/runtime/src/kmp_queues.cpp
index bedc5861b8d8ace0ffb5906ce22e76d330a57978..2fd37ef6590ff4347f1520e9c27daa9713c02597 100644
--- a/runtime/src/kmp_queues.cpp
+++ b/runtime/src/kmp_queues.cpp
@@ -10,7 +10,9 @@
 kmp_queue_data_t *__kmp_queue_of_core(int core) { return machine_places.core(core); };
+kmp_int32 __kmp_machine_cores() { return machine_places.cores(); };
 kmp_queue_data_t *__kmp_queue_of_node(int node) { return machine_places.node(node); };
+kmp_int32 __kmp_machine_nodes() { return machine_places.nodes(); };
 kmp_queue_data_t *__kmp_node_queue_of_core(int core) { return machine_places.node_from_cpu(core); };
 kmp_queue_data_t *__kmp_machine_queue() { return machine_places.machine(); };
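Note on the accessors above: cores()/nodes() report how many per-core and per-NUMA-node place queues MachineInfo holds, and they are exported as __kmp_machine_cores()/__kmp_machine_nodes() so callers can pair a count with the matching indexed lookup. A minimal sketch of that count-then-index pattern (the helper name and the visit callback are illustrative, not part of the patch; assumes the runtime's internal headers):

    /* Walk every per-NUMA-node queue via the accessors added above. */
    static void visit_all_node_queues(void (*visit)(kmp_queue_data_t *q)) {
        kmp_int32 n = __kmp_machine_nodes();   /* number of NUMA-node places */
        for (kmp_int32 i = 0; i < n; ++i)
            visit(__kmp_queue_of_node(i));     /* place queue of node i */
    }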
diff --git a/runtime/src/kmp_tasking.c b/runtime/src/kmp_tasking.c
index 69d9396b6f90962376e0ae4dfcdee4c13fbf4d0a..70b67931d094b9e633d68d293d450e902ebae97d 100644
--- a/runtime/src/kmp_tasking.c
+++ b/runtime/src/kmp_tasking.c
@@ -341,12 +341,6 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
             /*TODO PV this is a system call, we should improve this!*/
             selected_nodeid = kaapi_numa_getpage_id( (void*)taskdata->td_aff_tag );
             KA_TRACE(5, ( "__kmp_push_task: selected node based on mem addr: %i\n", selected_nodeid ) );
-            /*Intentional no break, we need to get the queue!*/
-            /*affinity "node" => numa id. if the numa node is not in the team, push to the first node*/
-        case 2:
-            if (taskdata->td_aff_kind == 2)
-                selected_nodeid = (int)taskdata->td_aff_tag;
-            KA_TRACE(5, ( "__kmp_push_task: selected node based on number: %i\n", selected_nodeid ) );
             if (KMP_CPU_ISSET(selected_nodeid, task_team->tt.tt_nodes_mask)) {
                 KA_TRACE(5, ( "__kmp_push_task: node is in team!\n" ) );
                 selected_queue = &(__kmp_queue_of_node(selected_nodeid)->qd);
@@ -356,6 +350,14 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
                 selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
             }
             break;
+            /*affinity "node" => numa id. if the numa node is not in the team, wrap (modulo) onto one of the team's nodes*/
+        case 2:
+            selected_nodeid = (int)taskdata->td_aff_tag;
+            KA_TRACE(5, ( "__kmp_push_task: selected node based on number: %i\n", selected_nodeid ) );
+            if (selected_nodeid >= task_team->tt.tt_nodes_count)
+                selected_nodeid %= (int)task_team->tt.tt_nodes_count;
+            selected_queue = &task_team->tt.tt_nodes_tasks_queues[selected_nodeid]->qd;
+            break;
             /*affinity "core" => tid or cpu id on the machine, todo -> clarify this*/
         case 3:
             selected_coreid = (int)taskdata->td_aff_tag;
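The new case 2 no longer relies on falling through from the memory-affinity case: a task tagged with an explicit node number is routed through the team's dense tt_nodes_tasks_queues array, with out-of-range ids wrapped by modulo so every tag lands on a queue the team actually owns. The mapping in isolation (standalone sketch; the function name is illustrative):

    #include <assert.h>

    /* Index selection used by case 2 above: tags beyond the team's node
     * count wrap around instead of being rejected. Illustrative only. */
    static int node_queue_index(int tag, int nodes_count) {
        assert(nodes_count > 0 && tag >= 0);
        return (tag < nodes_count) ? tag : tag % nodes_count;
    }

So on a 4-node team a tag of 5 maps to queue 1, while tags 0..3 map to themselves.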
@@ -1790,85 +1792,54 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task
     thread_data = & task_team -> tt.tt_threads_data[ __kmp_tid_from_gtid( gtid ) ];
 
     /*TODO PV hws remove queue! (in this case, CORE queue is probably right)*/
-    kmp_base_queue_data_t *selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
-    /*We may not be initialized yet!*/
-    if (!selected_queue)
-        return NULL;
+    kmp_base_queue_data_t *queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
 
 #if LIBOMP_USE_THEQUEUE
     KMP_DEBUG_ASSERT( gtid == __kmp_get_gtid() );
     taskdata = 0;
-    if (!kaapi_wsqueue_empty(&(selected_queue->td_wsdeque)))
+    if (!kaapi_wsqueue_empty(&queue->td_wsdeque))
     {
-        taskdata = kaapi_wsqueue_pop_task( &(selected_queue->td_wsdeque) );
+        /* KMP_LEVEL_CORE queue is shared among all threads (that can push/pop): use locked pop */
+        taskdata = kaapi_wsqueue_locked_pop_task( &queue->td_wsdeque );
         if (taskdata) goto igotatask;
     }
-    { /* scope */
-#if 0 && LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
-        int local_node = kaapi_cpu2numa[cpu];
-
-#if 0
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue))
-        {
-            taskdata = kaapi_wsqueue_locked_pop_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue
-            );
-            if (taskdata) goto igotatask;
-        }
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue))
+#if LIBOMP_USE_AFFINITY
+    queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
+    if (!kaapi_wsqueue_empty(&queue->td_wsdeque))
     {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue
-        );
+        taskdata = kaapi_wsqueue_locked_pop_task(&queue->td_wsdeque);
         if (taskdata) goto igotatask;
     }
 #endif
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue))
-    {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue
-        );
-        if (taskdata) goto igotatask;
-    }
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue))
-    {
-        taskdata = kaapi_wsqueue_locked_pop_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue
-        );
-        if (taskdata) goto igotatask;
-    }
-#endif
-    } /* scope */
-
 igotatask:
     if (taskdata ==0) return 0;
     task = KMP_TASKDATA_TO_TASK( taskdata );
     return task;
 #else
     KA_TRACE(10, ("__kmp_remove_my_task(enter): T#%d ntasks=%d head=%u tail=%u\n",
-                  gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                  selected_queue->td_deque_tail) );
+                  gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                  queue->td_deque_tail) );
 
-    if (TCR_4(selected_queue -> td_deque_ntasks) == 0) {
+    if (TCR_4(queue -> td_deque_ntasks) == 0) {
         KA_TRACE(10, ("__kmp_remove_my_task(exit #1): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                      gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                      selected_queue->td_deque_tail) );
+                      gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                      queue->td_deque_tail) );
         return NULL;
     }
 
-    __kmp_acquire_bootstrap_lock( & selected_queue -> td_deque_lock );
+    __kmp_acquire_bootstrap_lock( & queue -> td_deque_lock );
 
-    if (TCR_4(selected_queue -> td_deque_ntasks) == 0) {
-        __kmp_release_bootstrap_lock( & selected_queue -> td_deque_lock );
+    if (TCR_4(queue -> td_deque_ntasks) == 0) {
+        __kmp_release_bootstrap_lock( & queue -> td_deque_lock );
         KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                      gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                      selected_queue->td_deque_tail) );
+                      gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                      queue->td_deque_tail) );
         return NULL;
     }
 
-    kmp_uint32 tail = ( selected_queue -> td_deque_tail - 1 ) & TASK_DEQUE_MASK(selected_queue);  // Wrap index.
-    taskdata = selected_queue -> td_deque[ tail ];
+    kmp_uint32 tail = ( queue -> td_deque_tail - 1 ) & TASK_DEQUE_MASK(queue);  // Wrap index.
+    taskdata = queue -> td_deque[ tail ];
 
     if (is_constrained && (taskdata->td_flags.tiedness == TASK_TIED)) {
         // we need to check if the candidate obeys task scheduling constraint:
@@ -1882,22 +1853,22 @@
         }
         if ( parent != current ) {
             // If the tail task is not a child, then no other child can appear in the deque.
-            __kmp_release_bootstrap_lock( & selected_queue -> td_deque_lock );
+            __kmp_release_bootstrap_lock( & queue -> td_deque_lock );
             KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d No tasks to remove: ntasks=%d head=%u tail=%u\n",
-                          gtid, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                          selected_queue->td_deque_tail) );
+                          gtid, queue->td_deque_ntasks, queue->td_deque_head,
+                          queue->td_deque_tail) );
             return NULL;
         }
     }
 
-    selected_queue -> td_deque_tail = tail;
-    TCW_4(selected_queue -> td_deque_ntasks, selected_queue -> td_deque_ntasks - 1);
+    queue -> td_deque_tail = tail;
+    TCW_4(queue -> td_deque_ntasks, queue -> td_deque_ntasks - 1);
 
-    __kmp_release_bootstrap_lock( & selected_queue->td_deque_lock );
+    __kmp_release_bootstrap_lock( & queue->td_deque_lock );
 
     KA_TRACE(10, ("__kmp_remove_my_task(exit #2): T#%d task %p removed: ntasks=%d head=%u tail=%u\n",
-                  gtid, taskdata, selected_queue->td_deque_ntasks, selected_queue->td_deque_head,
-                  selected_queue->td_deque_tail) );
+                  gtid, taskdata, queue->td_deque_ntasks, queue->td_deque_head,
+                  queue->td_deque_tail) );
 
     task = KMP_TASKDATA_TO_TASK( taskdata );
     return task;
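In the LIBOMP_USE_THEQUEUE path above, __kmp_remove_my_task now uses kaapi_wsqueue_locked_pop_task because the KMP_LEVEL_CORE queue is a shared place queue rather than a thread-private deque, and under LIBOMP_USE_AFFINITY it retries on the NUMA-level queue before giving up. The resulting pop order, condensed (illustrative wrapper; the types and kaapi_* calls are the ones the patch uses):

    /* Condensed pop order of __kmp_remove_my_task after this change. */
    static kmp_taskdata_t *pop_local(kmp_base_thread_data_t *td) {
        kmp_base_queue_data_t *q = &td->td_tasks_queues[KMP_LEVEL_CORE]->qd;
        kmp_taskdata_t *t = NULL;
        if (!kaapi_wsqueue_empty(&q->td_wsdeque))
            t = kaapi_wsqueue_locked_pop_task(&q->td_wsdeque); /* shared queue: locked */
    #if LIBOMP_USE_AFFINITY
        if (!t) { /* fall back to the NUMA-node place queue */
            q = &td->td_tasks_queues[KMP_LEVEL_NUMA]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_locked_pop_task(&q->td_wsdeque);
        }
    #endif
        return t;
    }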
@@ -1917,14 +1888,15 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
 {
     kmp_task_t * task;
     kmp_taskdata_t * taskdata;
-    kmp_thread_data_t *victim_td, *threads_data;
+    kmp_thread_data_t *victim_td, *thread_td, *threads_data;
+    kmp_int32 thread_tid = __kmp_tid_from_gtid( gtid );
     kmp_int32 victim_tid;
 
     KMP_DEBUG_ASSERT( __kmp_tasking_mode != tskm_immediate_exec );
 
     threads_data = task_team -> tt.tt_threads_data;
     KMP_DEBUG_ASSERT( threads_data != NULL );  // Caller should check this condition
-
+    thread_td = & threads_data[ thread_tid ];
     victim_tid = victim->th.th_info.ds.ds_tid;
     victim_td = & threads_data[ victim_tid ];
     /*TODO PV hws selection!*/
@@ -1940,86 +1912,48 @@
     }
 
     taskdata = 0;
-
-/*#if LIBOMP_USE_AFFINITY*/
-/*#if defined(__linux__)*/
-    /*int cpu = sched_getcpu();*/
-/*#else*/
-    /*int cpu = 0;*/
-/*#endif*/
-    /*int local_node = kaapi_cpu2numa[cpu];*/
-    /*kmp_info_t* thread = __kmp_threads[ gtid ];*/
+#if LIBOMP_USE_AFFINITY
     for (int ii=0; ii<8; ++ii)
     {
         for (int i=0; i<8; ++i)
         {
            sched_yield();
-            if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
-                taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
-        }
-    }
-#if 0
-#if 0
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue))
-    {
-        taskdata = kaapi_wsqueue_steal_task(
-            kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue
-        );
-        if (taskdata) goto return_from;
-    }
-    if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->queue))
+            victim_queue = &thread_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+            if (!kaapi_wsqueue_empty(&victim_queue->td_wsdeque))
+            {
+                taskdata = kaapi_wsqueue_steal_task( &victim_queue->td_wsdeque );
+                if (taskdata) goto return_from;
+            }
+            victim_queue = &thread_td->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
+            if (!kaapi_wsqueue_empty(&victim_queue->td_wsdeque))
+            {
+                taskdata = kaapi_wsqueue_steal_task( &victim_queue->td_wsdeque );
+                if (taskdata) goto return_from;
+            }
         }
-    }
-#if 1
+
+        victim_queue = &victim_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
         if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
         {
             taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
             if (taskdata) goto return_from;
         }
+
+#if 0
+        { /* random steal on NUMA node */
+            int numa_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA];
+            if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue))
+                taskdata = kaapi_wsqueue_steal_task(
+                    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue
+                );
+            if (taskdata) goto return_from;
+        }
 #endif
-    {
-        /* random steal on NUMA node */
-        int numa_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA];
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue))
-            taskdata = kaapi_wsqueue_steal_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numa_victim]->queue
-            );
-        if (taskdata) goto return_from;
-    }
     }
-#if 0
-    {
-        /* random steal on Core */
-        int core_victim = __kmp_get_random(thread) % kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE];
-        if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][core_victim]->queue))
-            taskdata = kaapi_wsqueue_steal_task(
-                kaapi_all_places[KAAPI_HWS_LEVELID_CORE][core_victim]->queue
-            );
-        if (taskdata) goto return_from;
-    }
-#endif
 #endif
-/*#endif*//*old USE_AFFINITY*/
+    victim_queue = &victim_td->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+    if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
+        taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
 
 return_from:
     if (taskdata ==0) return 0;
     task = KMP_TASKDATA_TO_TASK( taskdata );
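Note the victim selection above: inside the bounded retry loops the thief first drains the place queues it shares through its own thread_td (core, then NUMA) before touching the victim's core queue, and a final unconditional attempt on the victim's queue runs even when LIBOMP_USE_AFFINITY is off. One round, condensed (illustrative wrapper around the patch's calls):

    /* One steal round as structured above: the thief's own shared places
     * first, then the victim's core queue. Illustrative only. */
    static kmp_taskdata_t *steal_round(kmp_base_thread_data_t *thief_td,
                                       kmp_base_thread_data_t *victim_td) {
        kmp_taskdata_t *t = NULL;
        kmp_base_queue_data_t *q;
        int levels[2] = { KMP_LEVEL_CORE, KMP_LEVEL_NUMA };
        for (int l = 0; l < 2 && !t; ++l) {
            q = &thief_td->td_tasks_queues[levels[l]]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_steal_task(&q->td_wsdeque);
        }
        if (!t) {
            q = &victim_td->td_tasks_queues[KMP_LEVEL_CORE]->qd;
            if (!kaapi_wsqueue_empty(&q->td_wsdeque))
                t = kaapi_wsqueue_steal_task(&q->td_wsdeque);
        }
        return t;
    }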
@@ -2176,9 +2110,6 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
 # else
     int cpu = 0;
 # endif
-#if LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
-    int local_node = kaapi_cpu2numa[cpu];
-#endif
 #endif
 
 #if OMP_45_ENABLED
@@ -2299,9 +2230,8 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
         if (!use_own_tasks && (!kaapi_wsqueue_empty(& own_queue->td_wsdeque)
 #if LIBOMP_USE_AFFINITY
-                               ||
-                               !kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->queue) ||
-                               !kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][local_node]->private_queue)
+                               || !kaapi_wsqueue_empty( &threads_data[tid].td.td_tasks_queues[KMP_LEVEL_NUMA]->qd.td_wsdeque )
+                               || !kaapi_wsqueue_empty( &threads_data[tid].td.td_tasks_queues[KMP_LEVEL_CORE]->qd.td_wsdeque )
 #endif
                                )
            )
@@ -2461,6 +2391,7 @@
         }
     }
 
+#if LIBOMP_USE_AFFINITY
     /*Reset team hierarchical queues.*/
     /*TODO PV move this to the team creation!!!*/
     kmp_affin_mask_t *all_cpus;
@@ -2468,7 +2399,6 @@ __kmp_enable_tasking( kmp_task_team_t *task_team, kmp_info_t *this_thr )
     if (!task_team->tt.tt_nodes_mask)
         KMP_CPU_ALLOC(task_team->tt.tt_nodes_mask);
 
-
     for (i = 0; i < nthreads; i++) {
         kmp_base_thread_data_t *td = &threads_data[i].td;
         int physical_core = KMP_CPU_FIRST(threads_data[i].td.td_thr->th.th_affin_mask);
@@ -2478,9 +2408,18 @@
         td->td_tasks_queues[KMP_LEVEL_NUMA] = __kmp_node_queue_of_core(physical_core);
         td->td_tasks_queues[KMP_LEVEL_MACHINE] = __kmp_machine_queue();
         KA_TRACE( 10, ( "__kmp_enable_tasking: Using physical_core %u for thread %u\n", physical_core, i ) );
-    }
+    }
+    /* TODO TG: move code to team creation - allocate new queues and do not share queues between several teams */
     hwloc_cpuset_to_nodeset(__kmp_hwloc_topology, (hwloc_cpuset_t)all_cpus, (hwloc_nodeset_t)task_team->tt.tt_nodes_mask);
+    task_team->tt.tt_nodes_count = hwloc_bitmap_weight( (hwloc_nodeset_t)task_team->tt.tt_nodes_mask );
+    task_team->tt.tt_nodes_tasks_queues = (kmp_queue_data_t**)__kmp_allocate( task_team->tt.tt_nodes_count * sizeof(kmp_queue_data_t*) );
+    hwloc_nodeset_t tmp = hwloc_bitmap_dup( (hwloc_nodeset_t)task_team->tt.tt_nodes_mask );
+    for (i = 0; i < task_team->tt.tt_nodes_count; i++) {
+        int index = hwloc_bitmap_first( tmp );
+        hwloc_bitmap_clr( tmp, index );
+        task_team->tt.tt_nodes_tasks_queues[i] = __kmp_queue_of_node( index );
+    }
+    hwloc_bitmap_free( tmp ); /* release the scratch copy of the nodeset */
+#endif
 
     KA_TRACE( 10, ( "__kmp_enable_tasking(exit): T#%d\n", __kmp_gtid_from_thread( this_thr ) ) );
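The nodeset-to-queue-array setup above walks the team's node mask by repeatedly taking and clearing the first set bit of a scratch copy. hwloc can express the same walk without the copy via hwloc_bitmap_next; a standalone sketch (illustrative alternative, not what the patch does):

    #include <hwloc.h>

    /* Collect the ids of all set bits in a nodeset, in ascending order.
     * Returns how many were written; equals hwloc_bitmap_weight() when
     * `max` is large enough. */
    static int collect_node_ids(hwloc_const_nodeset_t nodes, int *out, int max) {
        int count = 0;
        for (int id = hwloc_bitmap_next(nodes, -1);   /* first set bit */
             id != -1 && count < max;
             id = hwloc_bitmap_next(nodes, id))       /* next set bit after id */
            out[count++] = id;
        return count;
    }

Either way, tt_nodes_tasks_queues[i] ends up holding the queue of the i-th active NUMA node, which is exactly the dense indexing the new case 2 in __kmp_push_task relies on.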