From ceaf3d1bf448ae284b7e547d42edf346711ff848 Mon Sep 17 00:00:00 2001
From: Philippe Virouleau <philippe.virouleau@imag.fr>
Date: Tue, 17 Jan 2017 16:53:42 +0100
Subject: [PATCH] Improve affinity integration.

 - Polish the non-affinity push/select
 - Set up basic affinity select (only cores)
 - Set up affinity push (thread/node/data/default)
 - Remove Kaapi's affinity: no need for it anymore!

Note: right now it is still possible to end up with a deadlock if affinity(node:X) is used, because we don't visit the nodes' queues yet.
The next commit will introduce this.
---
 runtime/CMakeLists.txt          |   4 -
 runtime/src/CMakeLists.txt      |   3 -
 runtime/src/kaapi_numaffinity.c | 480 --------------------------------
 runtime/src/kaapi_numaffinity.h |  91 ------
 runtime/src/kmp.h               |  12 +-
 runtime/src/kmp_affinity.cpp    |   4 +-
 runtime/src/kmp_tasking.c       | 173 +++++-------
 7 files changed, 83 insertions(+), 684 deletions(-)
 delete mode 100644 runtime/src/kaapi_numaffinity.c
 delete mode 100644 runtime/src/kaapi_numaffinity.h

diff --git a/runtime/CMakeLists.txt b/runtime/CMakeLists.txt
index 046d5bc..2109ad8 100644
--- a/runtime/CMakeLists.txt
+++ b/runtime/CMakeLists.txt
@@ -325,10 +325,6 @@ if(LIBOMP_USE_THE_AGGREGATION AND (NOT LIBOMP_USE_THEQUEUE))
   set(LIBOMP_USE_THEQUEUE TRUE)
 endif()
 
-if(LIBOMP_USE_AFFINITY AND (NOT LIBOMP_USE_THEQUEUE))
-  libomp_error_say("OpenMP affinity scheduler required T.H.E work stealing protocol")
-endif()
-
 set(LIBOMP_USE_PAPI FALSE CACHE BOOL
   "libOMP tracing based on PAPI")
 
diff --git a/runtime/src/CMakeLists.txt b/runtime/src/CMakeLists.txt
index d19fdd8..b6a0767 100644
--- a/runtime/src/CMakeLists.txt
+++ b/runtime/src/CMakeLists.txt
@@ -103,9 +103,6 @@ else()
   if (${LIBOMP_USE_THEQUEUE})
     libomp_append(LIBOMP_CFILES kaapi_sched_ccsync.c)
     libomp_append(LIBOMP_CFILES kaapi_rt.c)
-    if(${LIBOMP_USE_AFFINITY})
-    libomp_append(LIBOMP_CFILES kaapi_numaffinity.c)
-    endif()
   endif()
   # Get C++ files
   set(LIBOMP_CXXFILES
diff --git a/runtime/src/kaapi_numaffinity.c b/runtime/src/kaapi_numaffinity.c
deleted file mode 100644
index 186a252..0000000
--- a/runtime/src/kaapi_numaffinity.c
+++ /dev/null
@@ -1,480 +0,0 @@
-/*
-** xkaapi
-**
-** Copyright 2009,2010,2011,2012 INRIA.
-**
-** Contributors :
-**
-** thierry.gautier@inrialpes.fr
-**
-** This software is a computer program whose purpose is to execute
-** multithreaded computation with data flow synchronization between
-** threads.
-** 
-** This software is governed by the CeCILL-C license under French law
-** and abiding by the rules of distribution of free software.  You can
-** use, modify and/ or redistribute the software under the terms of
-** the CeCILL-C license as circulated by CEA, CNRS and INRIA at the
-** following URL "http://www.cecill.info".
-** 
-** As a counterpart to the access to the source code and rights to
-** copy, modify and redistribute granted by the license, users are
-** provided only with a limited warranty and the software's author,
-** the holder of the economic rights, and the successive licensors
-** have only limited liability.
-** 
-** In this respect, the user's attention is drawn to the risks
-** associated with loading, using, modifying and/or developing or
-** reproducing the software by the user in light of its specific
-** status of free software, that may mean that it is complicated to
-** manipulate, and that also therefore means that it is reserved for
-** developers and experienced professionals having in-depth computer
-** knowledge. Users are therefore encouraged to load and test the
-** software's suitability as regards their requirements in conditions
-** enabling the security of their systems and/or data to be ensured
-** and, more generally, to use and operate it in the same conditions
-** as regards security.
-** 
-** The fact that you are presently reading this means that you have
-** had knowledge of the CeCILL-C license and that you accept its
-** terms.
-** 
-*/
-#include "kmp.h"
-#include "kaapi_numaffinity.h"
-#include "kaapi_atomic.h"
-#include <stdio.h>
-#include <ctype.h>
-
-#if LIBOMP_USE_AFFINITY
-#include <numa.h>
-#include <numaif.h>
-#ifndef _GNU_SOURCE
-#    define _GNU_SOURCE
-#endif
-#include <sched.h> /* sched_getcpu */
-#endif
-
-/** All places as Kaapi exports: NUMA & core places only. Other entries are set to 0
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i] is the pointer to the i-th numa node.
-*/
-kaapi_place_t** kaapi_all_places[KAAPI_HWS_LEVELID_MAX] =
-{ 0, 0, 0, 0, 0 };
-
-/** sizes of each kaapi_all_places[level] */
-int kaapi_all_places_count[KAAPI_HWS_LEVELID_MAX] =
-{ 0, 0, 0, 0, 0 };
-
-/**
-*/
-char* kaapi_cpu2numa = 0;
-
-#if defined(__linux__)
-static bool kaapi_parse_range(
-    char** str,
-    unsigned int* index_low,
-    unsigned int* index_high,
-    int* stride
-);
-static bool
-kaapi_parse_unsigned_longlong(
-    char** str,
-    unsigned long long *pvalue
-);
-static bool
-kaapi_parse_unsigned_longlong(
-    char** str,
-    unsigned long long *pvalue
-);
-static bool
-kaapi_parse_unsigned_int(
-    char** str,
-    unsigned int *pvalue
-);
-static bool
-kaapi_parse_unsigned_long(
-    char** str,
-    unsigned long *pvalue
-);
-static bool
-kaapi_parse_int(
-    char** str,
-    int *pvalue
-);
-#endif
-
-
-/**
-*/
-static kaapi_atomic_t is_hwdetect_called = {0};
-int kaapi_mt_hwdetect(void)
-{
-  if (KAAPI_ATOMIC_INCR(&is_hwdetect_called) >1)
-    return EALREADY;
-
-  /* */
-  kaapi_all_places_count[KAAPI_HWS_LEVELID_MACHINE] = 1;
-  kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE] = (kaapi_place_t**)__kmp_allocate(
-       kaapi_all_places_count[KAAPI_HWS_LEVELID_MACHINE] * sizeof(kaapi_place_t*)
-  );
-  kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0] =
-      (kaapi_place_t*)__kmp_allocate(sizeof(kaapi_place_t));
-  kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0]->level = KAAPI_HWS_LEVELID_CORE;
-  kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0]->queue =
-      (kaapi_wsqueue_t*)__kmp_allocate(sizeof(kaapi_wsqueue_t));
-  kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0]->queue,
-      INITIAL_TASK_DEQUE_SIZE,
-      -1
-  );
-  kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0]->private_queue =
-      (kaapi_wsqueue_t*)__kmp_allocate(sizeof(kaapi_wsqueue_t));
-  kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_MACHINE][0]->private_queue,
-      INITIAL_TASK_DEQUE_SIZE,
-      -1
-  );
-
-  /* compute the number of cpu of the system */
-  kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE]    = __kmp_xproc;
-  kaapi_all_places[KAAPI_HWS_LEVELID_CORE] = (kaapi_place_t**)__kmp_allocate(
-       kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE] * sizeof(kaapi_place_t*)
-  );
-  kaapi_cpu2numa = (char*)__kmp_allocate( kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE] );
-  for (int i =0; i<kaapi_all_places_count[KAAPI_HWS_LEVELID_CORE]; ++i)
-  {
-#if LIBOMP_USE_AFFINITY
-    int numa_node = numa_node_of_cpu( i );
-#else
-    int numa_node = -1;
-#endif
-    kaapi_cpu2numa[i] = numa_node;
-
-    /* todo : better allocation place[i] on associated NUMA node ! */
-    kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i] =
-        (kaapi_place_t*)__kmp_allocate(sizeof(kaapi_place_t));
-    kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i]->level = KAAPI_HWS_LEVELID_CORE;
-    kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i]->queue = (kaapi_wsqueue_t*)
-#if LIBOMP_USE_AFFINITY
-      numa_alloc_onnode( sizeof(kaapi_wsqueue_t), numa_node );
-#else
-      __kmp_allocate(sizeof(kaapi_wsqueue_t));
-#endif
-    kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i]->queue,
-        INITIAL_TASK_DEQUE_SIZE, numa_node
-    );
-    kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i]->private_queue = (kaapi_wsqueue_t*)
-#if LIBOMP_USE_AFFINITY
-      numa_alloc_onnode( sizeof(kaapi_wsqueue_t), numa_node );
-#else
-      __kmp_allocate(sizeof(kaapi_wsqueue_t));
-#endif
-    kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_CORE][i]->private_queue,
-        INITIAL_TASK_DEQUE_SIZE, numa_node
-    );
-  }
-
-#if defined(__linux__)
-  char filename[sizeof("/sys/devices/system/cpu/cpu1024/topology/thread_siblings_list") + 16];
-  FILE *file;
-
-#if !defined(__MIC__)
-  /* 1/ parse numa node informations - MIC does not have it (KNC). */
-  memcpy(filename,
-      "/sys/devices/system/node/online",
-      sizeof("/sys/devices/system/node/online"));
-  file = fopen(filename,"r");
-  if (file !=0)
-  {
-    char *line = 0;
-    char *str = 0;
-    size_t len = 0;
-    kaapi_assert( getline (&line, &len, file) > 0);
-    str = line;
-
-    /* scan num '-' num */
-    unsigned int start, end;
-    kaapi_assert(kaapi_parse_range(&str, &start, &end, 0));
-    fclose( file );
-    free(line);
-
-    /* [OFFLOAD TG]extend count by the number of device memory view as the NUMA domain */
-    kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]  = 1+end-start;
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA] = (kaapi_place_t**)__kmp_allocate(
-         kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA] * sizeof(kaapi_place_t*)
-    );
-    for (int i=0; i<kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]; ++i)
-    {
-      /* todo : better allocation place[i] on i-th NUMA node ! */
-      kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i] =
-          (kaapi_place_t*)__kmp_allocate(sizeof(kaapi_place_t));
-      kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->level = KAAPI_HWS_LEVELID_NUMA;
-      kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->queue = (kaapi_wsqueue_t*)
-#if LIBOMP_USE_AFFINITY
-          numa_alloc_onnode( sizeof(kaapi_wsqueue_t), i );
-#else
-          __kmp_allocate(sizeof(kaapi_wsqueue_t));
-#endif
-      kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->queue,
-          INITIAL_TASK_DEQUE_SIZE, i
-      );
-      kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->private_queue = (kaapi_wsqueue_t*)
-#if LIBOMP_USE_AFFINITY
-         numa_alloc_onnode( sizeof(kaapi_wsqueue_t), i );
-#else
-         __kmp_allocate(sizeof(kaapi_wsqueue_t));
-#endif
-      kaapi_wsqueue_init( kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->private_queue,
-          INITIAL_TASK_DEQUE_SIZE, i
-      );
-#if 0
-      sprintf(filename, "/sys/devices/system/node/node%i/cpulist", i);
-      file = fopen(filename,"r");
-      kaapi_assert( file != 0 );
-      line = 0;
-      kaapi_assert( getline (&line, &len, file) > 0);
-      str = line;
-      fclose(file);
-
-      kaapi_cpuset_t* place_i = &affset[KAAPI_HWS_LEVELID_NUMA].cpuset[i];
-      KAAPI_CPUSET_ZERO(place_i);
-
-      unsigned int cpu;
-      kaapi_assert( kaapi_parse_oneplace(&str, &cpu, place_i) );
-      _kaapi_init_reversemap(KAAPI_HWS_LEVELID_NUMA, kaapi_default_param.map2affset, place_i);
-      free(line);
-#endif
-    }
-  }
-  else
-    printf("[kaapi] Cannot access to topology information\n");
-#else
-#error "To redo"
-  /* mic has one numa node with all cores inside */
-  unsigned int start, end;
-  start=0;
-  end=1;
-  /* [OFFLOAD TG]extend count by the number of device memory view as the NUMA domain */
-  kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]  = 1+end-start;
-  kaapi_all_places[KAAPI_HWS_LEVELID_NUMA] = (kaapi_place_t**)__kmp_allocate(
-       kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA] * sizeof(kaapi_place_t*)
-  );
-  for (int i=0; i<kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]; ++i)
-  {
-    /* todo : better allocation place[i] on i-th NUMA node ! */
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i] =
-        (kaapi_place_t*)__kmp_allocate(sizeof(kaapi_place_t));
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->level = KAAPI_HWS_LEVELID_NUMA;
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->queue = 0;
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->private_queue = 0;
-  }
-#endif
-
-#else // not if defined(__linux__)
-  kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]  = 1;
-  kaapi_all_places[KAAPI_HWS_LEVELID_NUMA] = (kaapi_place_t**)__kmp_allocate(
-       kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA] * sizeof(kaapi_place_t*)
-  );
-  for (int i=0; i<kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA]; ++i)
-  {
-    /* todo : better allocation place[i] on i-th NUMA node ! */
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i] =
-        (kaapi_place_t*)__kmp_allocate(sizeof(kaapi_place_t));
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->level = KAAPI_HWS_LEVELID_NUMA;
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->queue = 0;
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i]->private_queue = 0;
-  }
-
-#endif
-
-  return 0;
-}
-
-
-#if defined(__linux__)
-/*
- */
-static inline int is_range_delim(const int c)
-{
-  return (c == ':') || (c == '-');
-}
-
-/*
- */
-static inline int is_delim_stride(const int c)
-{
-  return (c == ':');
-}
-
-/* end of list
- */
-static inline int is_eol(const int c)
-{
-  /* end of list */
-  return (c == ',') || (c == 0);
-}
-
-/*
-*/
-static inline void eat_space(char** str)
-{
-  while ((*str !=0) && isspace ((int)**str))
-    ++*str;
-}
-
-/** Parse :
-    [!] [ low ] [ -|: [ high ] [: stride] ]
-
-    low -> num | <empty>
-    high -> num | <empty>
-    if empty detected for low, high then it set value to (unsigned int)-1.
-    If stride pointer is not null and no stride is specified, then stride is set to 1.
-    If stride pointer is null and a stride is specified, then it is an error
-    If low '-' high then, low <= high.
-    Else if format is low ':' high, then this is the OpenMP format with low:count.
- */
-static bool kaapi_parse_range(
-    char** str,
-    unsigned int* index_low,
-    unsigned int* index_high,
-    int* stride
-)
-{
-  char range_delim = ' ';
-  if (isdigit(**str))
-  {
-    if (!kaapi_parse_unsigned_int(str, index_low))
-      return false;
-  }
-  else
-  {
-    if (!is_range_delim(**str)) /* - or : */
-      return false;
-    *index_low = (unsigned int)-1;
-  }
-
-  *index_high = *index_low;
-  if (stride !=0)
-    *stride = 1;
-
-  if (is_range_delim(**str)) /* - or : */
-  {
-    range_delim = **str;
-    ++*str;
-
-    /* 1 token look ahead */
-    if (is_eol(**str))
-      *index_high = (unsigned int)-1;
-    else
-    {
-      if (!kaapi_parse_unsigned_int(str, index_high))
-        return false;
-
-      if ((stride ==0) && is_delim_stride(**str))
-        return false;
-
-      if (is_delim_stride(**str))
-      {
-        ++*str;
-        if (is_eol(**str))
-          *stride = 1;
-        if (!kaapi_parse_int(str, stride))
-          return false;
-      }
-    }
-  }
-
-  /* test if empty set (index_low-index_high:stride)
-     or *stride ==0.
-  */
-  if (range_delim == ':')  
-  {
-    if (*index_high == -1) *index_high = 1;
-    if (*index_high < 1) return false;
-    *index_high = *index_low + (*index_high -1) * (stride == 0 ? 1 : *stride);
-  }
-
-  if ((*index_low > *index_high) || ((stride !=0) && (*stride ==0)))
-    return false;
-
-  return true;
-}
-
-/* Parse an unsigned long environment varible.  Return true if one was
-   present and it was successfully parsed.  */
-static bool
-kaapi_parse_int (char** str, int *pvalue)
-{
-  unsigned long value = 0;
-  if (!kaapi_parse_unsigned_long(str, &value))
-    return false;
-  if ((value > INT_MAX) || ((long)value < INT_MIN))
-     return false;
-  *pvalue = (int)value;
-  return true;
-}
-
-/* Parse an unsigned int environment varible.  Return true if one was
-   present and it was successfully parsed.  */
-static bool
-kaapi_parse_unsigned_int(
-    char** str,
-    unsigned int *pvalue
-)
-{
-  unsigned long long value = 0;
-  if (!kaapi_parse_unsigned_longlong(str, &value))
-    return false;
-  if (value > UINT_MAX)
-     return false;
-  *pvalue = (unsigned int)value;
-  return true;
-}
-
-/* Parse an unsigned long environment varible.  Return true if one was
-   present and it was successfully parsed.  */
-static bool
-kaapi_parse_unsigned_long(
-    char** str,
-    unsigned long *pvalue
-)
-{
-  unsigned long long value = 0;
-  if (!kaapi_parse_unsigned_longlong(str, &value))
-    return false;
-  if (value > ULONG_MAX)
-     return false;
-  *pvalue = (unsigned long)value;
-  return true;
-}
-
-
-/* Parse an unsigned long long environment varible.  Return true if one was
-   present and it was successfully parsed.  */
-static bool
-kaapi_parse_unsigned_longlong(
-    char** str,
-    unsigned long long *pvalue
-)
-{
-  char *end;
-  unsigned long long value;
-
-  if (*str == 0)
-    return false;
-
-  eat_space(str);
-  if (**str == '\0')
-    goto invalid;
-
-  errno = 0;
-  value = strtoul (*str, &end, 10);
-  if (errno || value == ULLONG_MAX)
-    goto invalid;
-
-  *pvalue = (unsigned long long)value;
-  *str = end;
-  return true;
-
- invalid:
-  return false;
-}
-#endif
diff --git a/runtime/src/kaapi_numaffinity.h b/runtime/src/kaapi_numaffinity.h
deleted file mode 100644
index 571d4ab..0000000
--- a/runtime/src/kaapi_numaffinity.h
+++ /dev/null
@@ -1,91 +0,0 @@
-/*
-** xkaapi
-**
-** Copyright 2009,2010,2011,2012 INRIA.
-**
-** Contributors :
-**
-** thierry.gautier@inrialpes.fr
-**
-** This software is a computer program whose purpose is to execute
-** multithreaded computation with data flow synchronization between
-** threads.
-** 
-** This software is governed by the CeCILL-C license under French law
-** and abiding by the rules of distribution of free software.  You can
-** use, modify and/ or redistribute the software under the terms of
-** the CeCILL-C license as circulated by CEA, CNRS and INRIA at the
-** following URL "http://www.cecill.info".
-** 
-** As a counterpart to the access to the source code and rights to
-** copy, modify and redistribute granted by the license, users are
-** provided only with a limited warranty and the software's author,
-** the holder of the economic rights, and the successive licensors
-** have only limited liability.
-** 
-** In this respect, the user's attention is drawn to the risks
-** associated with loading, using, modifying and/or developing or
-** reproducing the software by the user in light of its specific
-** status of free software, that may mean that it is complicated to
-** manipulate, and that also therefore means that it is reserved for
-** developers and experienced professionals having in-depth computer
-** knowledge. Users are therefore encouraged to load and test the
-** software's suitability as regards their requirements in conditions
-** enabling the security of their systems and/or data to be ensured
-** and, more generally, to use and operate it in the same conditions
-** as regards security.
-** 
-** The fact that you are presently reading this means that you have
-** had knowledge of the CeCILL-C license and that you accept its
-** terms.
-** 
-*/
-#include "kaapi_wsprotocol.h"
-
-#if defined(__cplusplus)
-extern "C" {
-#endif
-
-
-/** \ingroup HWS
-    hierarchy level identifiers
-    Duplicated in likomp.h
- */
-typedef enum kaapi_hws_levelid
-{
-  KAAPI_HWS_LEVELID_THREAD       = 0,
-  KAAPI_HWS_LEVELID_CORE         = 1,
-  KAAPI_HWS_LEVELID_NUMA         = 2,
-  KAAPI_HWS_LEVELID_SOCKET       = 3,
-  KAAPI_HWS_LEVELID_MACHINE      = 4,
-  KAAPI_HWS_LEVELID_MAX
-} kaapi_hws_levelid_t;
-
-/** A place : queues with locks and steal context to receive steal request.
-    Allocated on page boundary and initialized by one of the threads of the place.
-*/
-typedef struct kaapi_place_t {
-  kaapi_hws_levelid_t          level;          /* kind */
-  kaapi_wsqueue_t*             queue;          /* a queue of task */
-  kaapi_wsqueue_t*             private_queue;  /* private queue to the locality domain */
-} kaapi_place_t;
-
-/** All places as Kaapi exports: NUMA & core places only. Other entries are set to 0
-    kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][i] is the pointer to the i-th numa node.
-*/
-extern kaapi_place_t** kaapi_all_places[KAAPI_HWS_LEVELID_MAX];
-
-/** sizes of each kaapi_all_places[level] */
-extern int kaapi_all_places_count[KAAPI_HWS_LEVELID_MAX];
-
-/** Map cpu -> NUMA node
-*/
-extern char* kaapi_cpu2numa;
-
-/** Should be called after kmp_aux_affinity_initialize
-*/
-extern int kaapi_mt_hwdetect(void);
-
-#if defined(__cplusplus)
-}
-#endif
diff --git a/runtime/src/kmp.h b/runtime/src/kmp.h
index 1a2fc81..8744af0 100644
--- a/runtime/src/kmp.h
+++ b/runtime/src/kmp.h
@@ -2257,9 +2257,11 @@ struct kmp_taskdata {                                 /* aligned during dynamic
     kmp_int32               td_level;                 /* task nesting level                      */
     kmp_int32               td_untied_count;          /* untied task active parts counter        */
     ident_t*                td_ident;                 /* task identifier                         */
+#if LIBOMP_USE_AFFINITY
     kmp_uint8               td_aff_kind;              // Affinity kind & tag & strict for the task
     kmp_uint8               td_aff_strict;
     kmp_uint64              td_aff_tag;
+#endif
                             // Taskwait data.
     ident_t *               td_taskwait_ident;
     kmp_uint32              td_taskwait_counter;
@@ -2322,7 +2324,7 @@ typedef enum machine_levels
 typedef struct kmp_base_thread_data {
     kmp_info_p *            td_thr;                // Pointer back to thread info
                                                    // Used only in __kmp_execute_tasks_template, maybe not avail until task is queued?
-    kmp_queue_data_t        *td_tasks_queues[KMP_LEVEL_MAX];
+    kmp_queue_data_t        *td_tasks_queues[KMP_LEVEL_MAX] = {0};
                                                    // GEH: shouldn't this be volatile since used in while-spin?
     kmp_int32               td_deque_last_stolen;  // Thread number of last successful steal
 #ifdef BUILD_TIED_TASK_STACK
@@ -2357,7 +2359,8 @@ typedef struct kmp_base_task_team {
 #if OMP_45_ENABLED
     kmp_int32               tt_found_proxy_tasks;  /* Have we found proxy tasks since last barrier */
 #endif
-#if KMP_AFFINITY_SUPPORTED
+    //TODO PV restore this once the initialization of stuff is correct!
+#if 1 || LIBOMP_USE_AFFINITY
     // Mask of NUMA nodes active for this team
     kmp_affin_mask_t        *tt_nodes_mask;
 #endif
@@ -2494,6 +2497,8 @@ typedef struct KMP_ALIGN_CACHE kmp_base_info {
     kmp_uint32           th_task_state_top;         // Top element of th_task_state_memo_stack
     kmp_uint32           th_task_state_stack_sz;    // Size of th_task_state_memo_stack
 #if LIBOMP_USE_AFFINITY
+    // The following stores the affinity set up by the last "__kmpc_omp_set_task_affinity" call
+    // so that we can set the correct affinity in the taskdata on the next task creation call
     kmp_uint8            th_aff_kind;            // Affinity kind & tag for the next task
     kmp_uint8            th_aff_strict;
     kmp_uint64           th_aff_tag;
@@ -3232,9 +3237,6 @@ extern int __kmp_aux_set_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_unset_affinity_mask_proc(int proc, void **mask);
 extern int __kmp_aux_get_affinity_mask_proc(int proc, void **mask);
 extern void __kmp_balanced_affinity( int tid, int team_size );
-#if LIBOMP_USE_AFFINITY
-extern int kaapi_mt_hwdetect(void);
-#endif
 #endif /* KMP_AFFINITY_SUPPORTED */
 
 extern void __kmp_cleanup_hierarchy();
diff --git a/runtime/src/kmp_affinity.cpp b/runtime/src/kmp_affinity.cpp
index f39a748..06d0025 100644
--- a/runtime/src/kmp_affinity.cpp
+++ b/runtime/src/kmp_affinity.cpp
@@ -4095,10 +4095,10 @@ __kmp_affinity_initialize(void)
     if (disabled) {
         __kmp_affinity_type = affinity_disabled;
     }
-#if LIBOMP_USE_AFFINITY
+    //TODO PV restore this when init is fixed
+#if 1 || LIBOMP_USE_AFFINITY
     //Also init the places;
     machine_places.init();
-    kaapi_mt_hwdetect();
 #endif
 }
 
diff --git a/runtime/src/kmp_tasking.c b/runtime/src/kmp_tasking.c
index 670a324..69d9396 100644
--- a/runtime/src/kmp_tasking.c
+++ b/runtime/src/kmp_tasking.c
@@ -32,7 +32,6 @@
 #    define _GNU_SOURCE
 #endif
 #include <sched.h> /* sched_getcpu */
-#include "kaapi_numaffinity.h"
 
 /*
 */
@@ -324,91 +323,65 @@ __kmp_push_task(kmp_int32 gtid, kmp_task_t * task )
     // Find tasking deque specific to encountering thread
     thread_data = & task_team -> tt.tt_threads_data[ tid ];
 
-    /*TODO PV: hws push!*/
-    kmp_base_queue_data_t *selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
-    KMP_DEBUG_ASSERT(selected_queue);
+    kmp_base_queue_data_t *selected_queue = NULL;
+    /*= &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;*/
+
 
-    // No lock needed since only owner can allocate
-#if LIBOMP_USE_THEQUEUE
-    kaapi_wsqueue_t* pqueue = 0;
-#if defined(__linux__)
-    int cpu = sched_getcpu();
-#else
-    int cpu = 0;
-#endif
-    int local_node = 0;
 #if LIBOMP_USE_AFFINITY
-    local_node = kaapi_cpu2numa[cpu];
-    kaapi_wsqueue_t** ppqueue = 0;
-    int isremote = 1; /* always consider 'thief' concurrent push for non own queue */
-    int select = taskdata->td_aff_kind;
-#if 0
-    /* push at least a task for the current thread */
-    if (kaapi_wsqueue_empty(&(selected_queue->td_wsdeque)) && (select !=0) && (taskdata->td_aff_strict==0))
-      select = 0;
-#endif
-    switch (select)
-    {
+    int selected_nodeid = -1;
+    int selected_coreid = thread->th.th_team_nproc;
+    switch (taskdata->td_aff_kind) {
+      /*Default case, just push to our current thread queue!*/
       case 0:
-        pqueue = &(selected_queue->td_wsdeque);
-        isremote = __kmp_get_gtid() == gtid ? 0 : 1;
-      break;
-
-      case 1: /* data */
-      case 2: /* numa id */
-      {
-        int numaid;
-        if (taskdata->td_aff_kind == 1)
-          numaid = kaapi_numa_getpage_id( (void*)taskdata->td_aff_tag );
-        else
-        {
-          numaid = (int)taskdata->td_aff_tag;
-          if (numaid >= kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA])
-            numaid %= kaapi_all_places_count[KAAPI_HWS_LEVELID_NUMA];
+        selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+        KA_TRACE(5, ( "__kmp_push_task: pushing to own node, no affinity" ) );
+        break;
+      /*affinity "data"*/
+      case 1:
+        /*TODO PV this is a system call, we should improve this!*/
+        selected_nodeid = kaapi_numa_getpage_id( (void*)taskdata->td_aff_tag );
+        KA_TRACE(5, ( "__kmp_push_task: selected node based on mem addr: %i\n", selected_nodeid ) );
+        /*Intentional no break, we need to get the queue!*/
+      /*affinity "node" => numa id. if the numa node is not in the team, push to the first node*/
+      case 2:
+        if (taskdata->td_aff_kind == 2)
+          selected_nodeid = (int)taskdata->td_aff_tag;
+        KA_TRACE(5, ( "__kmp_push_task: selected node based on number: %i\n", selected_nodeid ) );
+        if (KMP_CPU_ISSET(selected_nodeid, task_team->tt.tt_nodes_mask)) {
+          KA_TRACE(5, ( "__kmp_push_task: node is in team!\n" ) );
+          selected_queue = &(__kmp_queue_of_node(selected_nodeid)->qd);
+        } else {
+          KA_TRACE(5, ( "__kmp_push_task: node not in team, select our own node!\n" ) );
+          selected_nodeid = -1;
+          selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_NUMA]->qd;
         }
-        isremote = (numaid != local_node);
-        if (taskdata->td_aff_strict)
-        {
-          ppqueue = &kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numaid]->private_queue;
-        } 
-        else
-        { 
-          ppqueue = &kaapi_all_places[KAAPI_HWS_LEVELID_NUMA][numaid]->queue;
-        } 
-//printf("%i push on numa %i\n", cpu, numaid);
-      } break;
-
-      case 3: /* core id ~ tid of the thread in the team (%) */
-      {
-        int coreid = (int)taskdata->td_aff_tag;
-        if (coreid >= thread->th.th_team_nproc)
-          coreid %=(int)thread->th.th_team_nproc;
-        if (taskdata->td_aff_strict)
-        { 
-          ppqueue = &kaapi_all_places[KAAPI_HWS_LEVELID_CORE][coreid]->private_queue;
-        } 
-        else
-        { 
-          ppqueue = &kaapi_all_places[KAAPI_HWS_LEVELID_CORE][coreid]->queue;
-        } 
-        isremote = (coreid != cpu);
-      } break;
-    };
-    KMP_DEBUG_ASSERT ( (ppqueue != 0) || (pqueue !=0) );
-
-    if (ppqueue)
-      pqueue = *ppqueue;
-
-    if (pqueue->deque == NULL ) { /* only for local queue */
-      kaapi_wsqueue_init( pqueue, INITIAL_TASK_DEQUE_SIZE, local_node);
+        break;
+      /*affinity "core" => tid or cpu id on the machine, todo -> clarify this*/
+      case 3:
+        selected_coreid = (int)taskdata->td_aff_tag;
+        if (selected_coreid >= thread->th.th_team_nproc)
+          selected_coreid %=(int)thread->th.th_team_nproc;
+        KA_TRACE(5, ( "__kmp_push_task: selected core based on number: %i\n", selected_coreid ) );
+        selected_queue = &(task_team->tt.tt_threads_data[selected_coreid].td.td_tasks_queues[KMP_LEVEL_CORE]->qd);
+        break;
+      default:
+        /* *should* be impossible*/
+        KMP_DEBUG_ASSERT(0);
+        break;
     }
 #else
-    pqueue = &(selected_queue->td_wsdeque);
+    /*If no affinity, we just use cores, that are always available!*/
+    selected_queue = &thread_data->td.td_tasks_queues[KMP_LEVEL_CORE]->qd;
+#endif/* USE_AFFINITY */
+    KMP_DEBUG_ASSERT(selected_queue);
+
+    // No lock needed since only owner can allocate
+#if LIBOMP_USE_THEQUEUE
+    kaapi_wsqueue_t* pqueue = &(selected_queue->td_wsdeque);
     if (pqueue->deque == NULL ) {
-      kaapi_wsqueue_init( pqueue, INITIAL_TASK_DEQUE_SIZE, -1);
+      kaapi_wsqueue_init( pqueue, INITIAL_TASK_DEQUE_SIZE, selected_nodeid);
     }
     int isremote = __kmp_get_gtid() == gtid ? 0 : 1;
-#endif
     kmp_int32 err;
     err = kaapi_wsqueue_locked_push_task(
           pqueue,
@@ -1831,12 +1804,7 @@ __kmp_remove_my_task( kmp_info_t * thread, kmp_int32 gtid, kmp_task_team_t *task
       if (taskdata) goto igotatask;
     }
     { /* scope  */
-#if defined(__linux__)
-    int cpu = sched_getcpu();
-#else
-    int cpu = 0;
-#endif
-#if LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
+#if 0 && LIBOMP_USE_AFFINITY /* use steal in place of pop because concurrency required if no aggregation */
     int local_node = kaapi_cpu2numa[cpu];
 
 #if 0
@@ -1973,19 +1941,24 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
     taskdata = 0;
 
 
-#if LIBOMP_USE_AFFINITY
-#if defined(__linux__)
-    int cpu = sched_getcpu();
-#else
-    int cpu = 0;
-#endif
-    int local_node = kaapi_cpu2numa[cpu];
-    kmp_info_t*  thread = __kmp_threads[ gtid ];
- for (int ii=0; ii<8; ++ii)
- {
-    for (int i=0; i<8; ++i)
+/*#if LIBOMP_USE_AFFINITY*/
+/*#if defined(__linux__)*/
+    /*int cpu = sched_getcpu();*/
+/*#else*/
+    /*int cpu = 0;*/
+/*#endif*/
+    /*int local_node = kaapi_cpu2numa[cpu];*/
+    /*kmp_info_t*  thread = __kmp_threads[ gtid ];*/
+    for (int ii=0; ii<8; ++ii)
     {
-       sched_yield();
+      for (int i=0; i<8; ++i)
+      {
+        sched_yield();
+        if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
+          taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
+      }
+    }
+#if 0
 #if 0
       if (!kaapi_wsqueue_empty(kaapi_all_places[KAAPI_HWS_LEVELID_CORE][cpu]->private_queue))
       {
@@ -2046,8 +2019,7 @@ __kmp_steal_task( kmp_info_t *victim, kmp_int32 gtid, kmp_task_team_t *task_team
     }
 #endif
 #endif
-    if (!kaapi_wsqueue_empty(&(victim_queue->td_wsdeque)))
-      taskdata = kaapi_wsqueue_steal_task( &(victim_queue->td_wsdeque) );
+/*#endif*//*old USE_AFFINITY*/
 return_from:
     if (taskdata ==0) return 0;
     task = KMP_TASKDATA_TO_TASK( taskdata );
@@ -2069,7 +2041,7 @@ return_from:
 
     KMP_COUNT_BLOCK(TASK_stolen);
     return task;
-#else
+#else //LIBOMP_USE_THEQUEUE
     KA_TRACE(10, ("__kmp_steal_task(enter): T#%d try to steal from T#%d: task_team=%p ntasks=%d "
                   "head=%u tail=%u\n",
                   gtid, __kmp_gtid_from_thread( victim ), task_team, victim_queue->td_deque_ntasks,
@@ -2232,7 +2204,10 @@ static inline int __kmp_execute_tasks_template(kmp_info_t *thread, kmp_int32 gti
 #if 1//!LIBOMP_USE_THEQUEUE /* with THE queue, always use random ws */
                 // Try to steal from the last place I stole from successfully.
                 if (victim == -2) { // haven't stolen anything yet
-                    victim = own_queue->td_deque_last_stolen;
+                    if (own_queue)
+                        victim = own_queue->td_deque_last_stolen;
+                    else
+                        victim = -1;
                     if (victim != -1) // if we have a last stolen from victim, get the thread
                         other_thread = threads_data[victim].td.td_thr;
                 }
-- 
GitLab