......@@ -133,7 +133,7 @@ Options for all Libraries
Options for ``libomp``
----------------------
**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64``
**LIBOMP_ARCH** = ``aarch64|arm|i386|mic|mips|mips64|ppc64|ppc64le|x86_64|riscv64``
The default value for this option is chosen based on probing the compiler for
architecture macros (e.g., is ``__x86_64__`` predefined by compiler?).
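As a rough sketch of what this probe amounts to (the macro names, including those for the newly added ``riscv64`` target, are assumptions based on common toolchain conventions):

.. code-block:: c++

   // Hypothetical sketch of the architecture probe: compile a snippet like
   // this and see which branch the preprocessor keeps.
   #if defined(__x86_64__)
   #  define LIBOMP_DETECTED_ARCH "x86_64"
   #elif defined(__aarch64__)
   #  define LIBOMP_DETECTED_ARCH "aarch64"
   #elif defined(__riscv) && (__riscv_xlen == 64)
   #  define LIBOMP_DETECTED_ARCH "riscv64"
   #else
   #  define LIBOMP_DETECTED_ARCH "unknown"
   #endif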
......@@ -141,10 +141,6 @@ Options for ``libomp``
Intel(R) Many Integrated Core Architecture (Intel(R) MIC Architecture) to
build for. This value is ignored if **LIBOMP_ARCH** does not equal ``mic``.
**LIBOMP_OMP_VERSION** = ``50|45|40|30``
OpenMP version to build for. Older versions will disable certain
functionality and entry points.
**LIBOMP_LIB_TYPE** = ``normal|profile|stubs``
Library type can be ``normal``, ``profile``, or ``stubs``.
......@@ -192,9 +188,9 @@ Optional Features
multi-node systems where a small ``CACHE_LINE`` setting leads to false sharing.
**LIBOMP_OMPT_SUPPORT** = ``ON|OFF``
Include support for the OpenMP Tools Interface (OMPT).
This option is supported and ``ON`` by default for x86, x86_64, AArch64, and
PPC64 on Linux* and macOS*.
Include support for the OpenMP Tools Interface (OMPT).
This option is supported and ``ON`` by default for x86, x86_64, AArch64,
PPC64 and RISCV64 on Linux* and macOS*.
This option is ``OFF`` if this feature is not supported for the platform.
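As a sketch of what OMPT support enables, a tool registers itself by exporting ``ompt_start_tool``; the signatures below follow the OpenMP 5.0 tools interface, and the example itself is illustrative rather than part of this change:

.. code-block:: c++

   #include <omp-tools.h>

   // Illustrative OMPT tool entry point; the runtime looks this symbol up
   // at startup when OMPT support is compiled in.
   static int my_init(ompt_function_lookup_t lookup, int initial_device_num,
                      ompt_data_t *tool_data) {
     return 1; // nonzero keeps the tool active
   }
   static void my_fini(ompt_data_t *tool_data) {}

   ompt_start_tool_result_t *ompt_start_tool(unsigned int omp_version,
                                             const char *runtime_version) {
     static ompt_start_tool_result_t result = {my_init, my_fini, {0}};
     return &result;
   }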
**LIBOMP_OMPT_OPTIONAL** = ``ON|OFF``
......@@ -225,9 +221,6 @@ These flags are **appended**, they do not overwrite any of the preset flags.
**LIBOMP_CPPFLAGS** = <space-separated flags>
Additional C preprocessor flags.
**LIBOMP_CFLAGS** = <space-separated flags>
Additional C compiler flags.
**LIBOMP_CXXFLAGS** = <space-separated flags>
Additional C++ compiler flags.
......@@ -325,12 +318,12 @@ Advanced Builds with Various Options
$ cmake -DCMAKE_C_COMPILER=icc -DCMAKE_CXX_COMPILER=icpc -DCMAKE_Fortran_COMPILER=ifort -DLIBOMP_FORTRAN_MODULES=on ..
- Have CMake find the C/C++ compiler and specify additional flags for the C
compiler, preprocessor, and C++ compiler.
- Have CMake find the C/C++ compiler and specify additional flags for the
preprocessor and C++ compiler.
.. code-block:: console
$ cmake -DLIBOMP_CFLAGS='-specific-flag' -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' ..
$ cmake -DLIBOMP_CPPFLAGS='-DNEW_FEATURE=1 -DOLD_FEATURE=0' -DLIBOMP_CXXFLAGS='--one-specific-flag --two-specific-flag' ..
- Build the stubs library
......
if (${OPENMP_STANDALONE_BUILD})
if (OPENMP_STANDALONE_BUILD)
# From HandleLLVMOptions.cmake
function(append_if condition value)
if (${condition})
......@@ -9,10 +9,26 @@ if (${OPENMP_STANDALONE_BUILD})
endfunction()
endif()
if (${OPENMP_ENABLE_WERROR})
# MSVC and clang-cl in compatibility mode map -Wall to -Weverything.
# TODO: LLVM adds /W4 instead, check if that works for the OpenMP runtimes.
if (NOT MSVC)
append_if(OPENMP_HAVE_WALL_FLAG "-Wall" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
if (OPENMP_ENABLE_WERROR)
append_if(OPENMP_HAVE_WERROR_FLAG "-Werror" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
endif()
# Additional warnings that are not enabled by -Wall.
append_if(OPENMP_HAVE_WCAST_QUAL_FLAG "-Wcast-qual" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WFORMAT_PEDANTIC_FLAG "-Wformat-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WIMPLICIT_FALLTHROUGH_FLAG "-Wimplicit-fallthrough" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WSIGN_COMPARE_FLAG "-Wsign-compare" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
# Warnings that we want to disable because they are too verbose or fragile.
append_if(OPENMP_HAVE_WNO_EXTRA_FLAG "-Wno-extra" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WNO_PEDANTIC_FLAG "-Wno-pedantic" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG "-Wno-maybe-uninitialized" CMAKE_C_FLAGS CMAKE_CXX_FLAGS)
append_if(OPENMP_HAVE_STD_GNUPP11_FLAG "-std=gnu++11" CMAKE_CXX_FLAGS)
if (NOT OPENMP_HAVE_STD_GNUPP11_FLAG)
append_if(OPENMP_HAVE_STD_CPP11_FLAG "-std=c++11" CMAKE_CXX_FLAGS)
......
include(CheckCCompilerFlag)
include(CheckCXXCompilerFlag)
check_c_compiler_flag(-Werror OPENMP_HAVE_WERROR_FLAG)
check_cxx_compiler_flag(-Wall OPENMP_HAVE_WALL_FLAG)
check_cxx_compiler_flag(-Werror OPENMP_HAVE_WERROR_FLAG)
# Additional warnings that are not enabled by -Wall.
check_cxx_compiler_flag(-Wcast-qual OPENMP_HAVE_WCAST_QUAL_FLAG)
check_cxx_compiler_flag(-Wformat-pedantic OPENMP_HAVE_WFORMAT_PEDANTIC_FLAG)
check_cxx_compiler_flag(-Wimplicit-fallthrough OPENMP_HAVE_WIMPLICIT_FALLTHROUGH_FLAG)
check_cxx_compiler_flag(-Wsign-compare OPENMP_HAVE_WSIGN_COMPARE_FLAG)
# Warnings that we want to disable because they are too verbose or fragile.
check_cxx_compiler_flag(-Wno-extra OPENMP_HAVE_WNO_EXTRA_FLAG)
check_cxx_compiler_flag(-Wno-pedantic OPENMP_HAVE_WNO_PEDANTIC_FLAG)
check_cxx_compiler_flag(-Wno-maybe-uninitialized OPENMP_HAVE_WNO_MAYBE_UNINITIALIZED_FLAG)
check_cxx_compiler_flag(-std=gnu++11 OPENMP_HAVE_STD_GNUPP11_FLAG)
check_cxx_compiler_flag(-std=c++11 OPENMP_HAVE_STD_CPP11_FLAG)
......@@ -78,7 +78,7 @@ endfunction()
# These flags are required to emit LLVM Bitcode. We check them together because
# if any of them are not supported, there is no point in finding out which are.
set(compiler_flags_required -emit-llvm -O1 --cuda-device-only --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
set(compiler_flags_required -emit-llvm -O1 --cuda-device-only -std=c++11 --cuda-path=${CUDA_TOOLKIT_ROOT_DIR})
set(compiler_flags_required_src "extern \"C\" __device__ int thread() { return threadIdx.x; }")
check_bitcode_compilation(LIBOMPTARGET_NVPTX_CUDA_COMPILER_SUPPORTS_FLAGS_REQUIRED "${compiler_flags_required_src}" ${compiler_flags_required})
......
......@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
#include <stdio.h>
// Warp ID in the CUDA block
......@@ -430,9 +431,10 @@ INLINE static void* data_sharing_push_stack_common(size_t PushSize) {
}
}
// Get address from lane 0.
((int *)&FrameP)[0] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[0], 0);
int *FP = (int *)&FrameP;
FP[0] = __kmpc_impl_shfl_sync(CurActive, FP[0], 0);
if (sizeof(FrameP) == 8)
((int *)&FrameP)[1] = __SHFL_SYNC(CurActive, ((int *)&FrameP)[1], 0);
FP[1] = __kmpc_impl_shfl_sync(CurActive, FP[1], 0);
return FrameP;
}
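The inline assembly removed here hints at what the new helpers abstract over. A plausible CUDA sketch of ``__kmpc_impl_unpack``/``__kmpc_impl_pack`` (an assumption, since target_impl.h is not shown in this diff) splits and rebuilds the 64-bit value exactly as the old code did inline:

INLINE void __kmpc_impl_unpack(uint64_t val, uint32_t &lo, uint32_t &hi) {
  // Split a 64-bit value into two 32-bit halves, as the removed mov.b64 did.
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
}

INLINE uint64_t __kmpc_impl_pack(uint32_t lo, uint32_t hi) {
  // Rebuild the 64-bit value from its two halves.
  uint64_t val;
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}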
......@@ -551,8 +553,7 @@ EXTERN void __kmpc_get_team_static_memory(int16_t isSPMDExecutionMode,
if (GetThreadIdInBlock() == 0) {
*frame = omptarget_nvptx_simpleMemoryManager.Acquire(buf, size);
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
__kmpc_impl_syncthreads();
return;
}
ASSERT0(LT_FUSSY, GetThreadIdInBlock() == GetMasterThreadID(),
......@@ -566,8 +567,7 @@ EXTERN void __kmpc_restore_team_static_memory(int16_t isSPMDExecutionMode,
if (is_shared)
return;
if (isSPMDExecutionMode) {
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
__kmpc_impl_syncthreads();
if (GetThreadIdInBlock() == 0) {
omptarget_nvptx_simpleMemoryManager.Release();
}
......
......@@ -10,8 +10,7 @@
//
// This file contains all the definitions that are relevant to
// the interface. The first section contains the interface as
// declared by OpenMP. A second section includes library private calls
// (mostly debug, temporary?) The third section includes the compiler
// declared by OpenMP. The second section includes the compiler
// specific interfaces.
//
//===----------------------------------------------------------------------===//
......@@ -211,51 +210,14 @@ typedef struct kmp_TaskDescr {
int32_t partId; // unused
kmp_TaskFctPtr destructors; // destructor of c++ first private
} kmp_TaskDescr;
// task dep defs
#define KMP_TASKDEP_IN 0x1u
#define KMP_TASKDEP_OUT 0x2u
typedef struct kmp_TaskDep_Public {
void *addr;
size_t len;
uint8_t flags; // bit 0: in, bit 1: out
} kmp_TaskDep_Public;
// flags that interpret the interface part of tasking flags
#define KMP_TASK_IS_TIED 0x1
#define KMP_TASK_FINAL 0x2
#define KMP_TASK_MERGED_IF0 0x4 /* unused */
#define KMP_TASK_DESTRUCTOR_THUNK 0x8
// flags for task setup return
#define KMP_CURRENT_TASK_NOT_SUSPENDED 0
#define KMP_CURRENT_TASK_SUSPENDED 1
// sync defs
typedef int32_t kmp_CriticalName[8];
////////////////////////////////////////////////////////////////////////////////
// flags for kstate (all bits initially off)
////////////////////////////////////////////////////////////////////////////////
// first 2 bits used by kmp_Reduction (defined in kmp_reduction.cpp)
#define KMP_REDUCTION_MASK 0x3
#define KMP_SKIP_NEXT_CALL 0x4
#define KMP_SKIP_NEXT_CANCEL_BARRIER 0x8
////////////////////////////////////////////////////////////////////////////////
// data
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
// external interface
////////////////////////////////////////////////////////////////////////////////
// query
EXTERN int32_t __kmpc_global_num_threads(kmp_Ident *loc); // missing
EXTERN int32_t __kmpc_bound_thread_num(kmp_Ident *loc); // missing
EXTERN int32_t __kmpc_bound_num_threads(kmp_Ident *loc); // missing
EXTERN int32_t __kmpc_in_parallel(kmp_Ident *loc); // missing
// parallel
EXTERN int32_t __kmpc_global_thread_num(kmp_Ident *loc);
EXTERN void __kmpc_push_num_threads(kmp_Ident *loc, int32_t global_tid,
......@@ -461,6 +423,8 @@ EXTERN void __kmpc_flush(kmp_Ident *loc);
// vote
EXTERN int32_t __kmpc_warp_active_thread_mask();
// syncwarp
EXTERN void __kmpc_syncwarp(int32_t);
// tasks
EXTERN kmp_TaskDescr *__kmpc_omp_task_alloc(kmp_Ident *loc,
......
......@@ -37,17 +37,13 @@ EXTERN void omp_set_num_threads(int num) {
PRINT(LD_IO, "call omp_set_num_threads(num %d)\n", num);
if (num <= 0) {
WARNING0(LW_INPUT, "expected positive num; ignore\n");
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
currTaskDescr->NThreads() = num;
} else if (parallelLevel[GetWarpId()] == 0) {
nThreads = num;
}
}
EXTERN int omp_get_num_threads(void) {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int rc = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
int rc = GetNumberOfOmpThreads(isSPMDMode());
PRINT(LD_IO, "call omp_get_num_threads() return %d\n", rc);
return rc;
}
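For context, a minimal host-side example (hypothetical, not part of this patch) that exercises these device entry points:

#include <omp.h>
#include <stdio.h>

int main() {
  #pragma omp target
  {
    // On the device, omp_set_num_threads() is honored only outside a
    // parallel region, matching the parallelLevel check above.
    omp_set_num_threads(4);
    #pragma omp parallel
    if (omp_get_thread_num() == 0)
      printf("threads: %d\n", omp_get_num_threads());
  }
  return 0;
}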
......@@ -56,12 +52,10 @@ EXTERN int omp_get_max_threads(void) {
if (parallelLevel[GetWarpId()] > 0)
// We're already in parallel region.
return 1; // default is 1 thread avail
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
ASSERT0(LT_FUSSY, !currTaskDescr->InParallelRegion(),
"Should no be in the parallel region");
// Not currently in a parallel region, return what was set.
int rc = currTaskDescr->NThreads();
int rc = 1;
if (parallelLevel[GetWarpId()] == 0)
rc = nThreads;
ASSERT0(LT_FUSSY, rc >= 0, "bad number of threads");
PRINT(LD_IO, "call omp_get_max_threads() return %d\n", rc);
return rc;
......@@ -156,10 +150,7 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
int rc = -1;
// If level is 0 or all parallel regions are not active - return 0.
unsigned parLevel = parallelLevel[GetWarpId()];
if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel)) {
rc = 0;
} else if (level > 0) {
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
int totLevel = omp_get_level();
if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr =
......@@ -179,9 +170,8 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
(currTaskDescr->IsParallelConstruct() ? "par" : "task"),
(int)currTaskDescr->InParallelRegion(), (int)sched,
currTaskDescr->RuntimeChunkSize(),
(int)currTaskDescr->ThreadId(),
(int)currTaskDescr->ThreadsInTeam(),
(int)currTaskDescr->NThreads());
(int)currTaskDescr->ThreadId(), (int)threadsInTeam,
(int)nThreads);
}
if (currTaskDescr->IsParallelConstruct()) {
......@@ -196,6 +186,12 @@ EXTERN int omp_get_ancestor_thread_num(int level) {
} while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
}
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 0;
}
PRINT(LD_IO, "call omp_get_ancestor_thread_num(level %d) returns %d\n", level,
rc)
......@@ -208,30 +204,14 @@ EXTERN int omp_get_team_size(int level) {
int rc = -1;
unsigned parLevel = parallelLevel[GetWarpId()];
// If level is 0 or all parallel regions are not active - return 1.
if (level == 0 || (level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel)) {
if (level == 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL) {
rc = threadsInTeam;
} else if (level == 0 ||
(level > 0 && parLevel < OMP_ACTIVE_PARALLEL_LEVEL &&
level <= parLevel) ||
(level > 1 && parLevel > OMP_ACTIVE_PARALLEL_LEVEL &&
level <= (parLevel - OMP_ACTIVE_PARALLEL_LEVEL))) {
rc = 1;
} else if (level > 0) {
int totLevel = omp_get_level();
if (level <= totLevel) {
omptarget_nvptx_TaskDescr *currTaskDescr =
getMyTopTaskDescriptor(/*isSPMDExecutionMode=*/false);
int steps = totLevel - level;
ASSERT0(LT_FUSSY, currTaskDescr,
"do not expect fct to be called in a non-active thread");
do {
if (currTaskDescr->IsParallelConstruct()) {
if (!steps) {
// found the level
rc = currTaskDescr->ThreadsInTeam();
break;
}
steps--;
}
currTaskDescr = currTaskDescr->GetPrevTaskDescr();
} while (currTaskDescr);
ASSERT0(LT_FUSSY, !steps, "expected to find all steps");
}
}
PRINT(LD_IO, "call omp_get_team_size(level %d) returns %d\n", level, rc)
return rc;
......
......@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
////////////////////////////////////////////////////////////////////////////////
......@@ -99,12 +100,9 @@ public:
// When IsRuntimeUninitialized is true, we assume that the caller is
// in an L0 parallel region and that all worker threads participate.
int tid = GetLogicalThreadIdInBlock(IsSPMDExecutionMode);
// Assume we are in teams region or that we use a single block
// per target region
ST numberOfActiveOMPThreads =
GetNumberOfOmpThreads(tid, IsSPMDExecutionMode);
ST numberOfActiveOMPThreads = GetNumberOfOmpThreads(IsSPMDExecutionMode);
// All warps that are in excess of the maximum requested, do
// not execute the loop
......@@ -212,7 +210,7 @@ public:
}
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(tid);
T tnum = currTaskDescr->ThreadsInTeam();
T tnum = GetNumberOfOmpThreads(checkSPMDMode(loc));
T tripCount = ub - lb + 1; // +1 because ub is inclusive
ASSERT0(LT_FUSSY, threadId < tnum,
"current thread is not needed here; error");
......@@ -383,21 +381,19 @@ public:
// Support for dispatch next
INLINE static int64_t Shuffle(unsigned active, int64_t val, int leader) {
int lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
hi = __SHFL_SYNC(active, hi, leader);
lo = __SHFL_SYNC(active, lo, leader);
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_sync(active, hi, leader);
lo = __kmpc_impl_shfl_sync(active, lo, leader);
return __kmpc_impl_pack(lo, hi);
}
INLINE static uint64_t NextIter() {
unsigned int active = __ACTIVEMASK();
int leader = __ffs(active) - 1;
int change = __popc(active);
unsigned lane_mask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lane_mask_lt));
unsigned int rank = __popc(active & lane_mask_lt);
__kmpc_impl_lanemask_t active = __ACTIVEMASK();
uint32_t leader = __kmpc_impl_ffs(active) - 1;
uint32_t change = __kmpc_impl_popc(active);
__kmpc_impl_lanemask_t lane_mask_lt = __kmpc_impl_lanemask_lt();
unsigned int rank = __kmpc_impl_popc(active & lane_mask_lt);
uint64_t warp_res;
if (rank == 0) {
warp_res = atomicAdd(
......@@ -455,7 +451,7 @@ public:
// automatically selects thread or warp ID based on selected implementation
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(tid, checkSPMDMode(loc)),
ASSERT0(LT_FUSSY, gtid < GetNumberOfOmpThreads(checkSPMDMode(loc)),
"current thread is not needed here; error");
// retrieve schedule
kmp_sched_t schedule =
......@@ -509,7 +505,7 @@ public:
PRINT(LD_LOOP,
"Got sched: active %d, total %d: lb %lld, ub %lld, stride = %lld, "
"last %d\n",
(int)GetNumberOfOmpThreads(tid, isSPMDMode()),
(int)GetNumberOfOmpThreads(isSPMDMode()),
(int)GetNumberOfWorkersInTeam(), (long long)*plower,
(long long)*pupper, (long long)*pstride, (int)*plast);
return DISPATCH_NOTFINISHED;
......@@ -782,8 +778,7 @@ EXTERN void __kmpc_reduce_conditional_lastprivate(kmp_Ident *loc, int32_t gtid,
"Expected non-SPMD mode + initialized runtime.");
omptarget_nvptx_TeamDescr &teamDescr = getMyTeamDescriptor();
int tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(tid, checkSPMDMode(loc));
uint32_t NumThreads = GetNumberOfOmpThreads(checkSPMDMode(loc));
uint64_t *Buffer = teamDescr.getLastprivateIterBuffer();
for (unsigned i = 0; i < varNum; i++) {
// Reset buffer.
......
......@@ -33,6 +33,8 @@ __device__ __shared__ uint32_t usedSlotIdx;
__device__ __shared__ uint8_t parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
__device__ __shared__ uint16_t threadLimit;
__device__ __shared__ uint16_t threadsInTeam;
__device__ __shared__ uint16_t nThreads;
// Pointer to this team's OpenMP state object
__device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
......
......@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// global data tables
......@@ -63,7 +64,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
// init team context
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/false);
currTeamDescr.InitTeamDescr();
// this thread will start execution... has to update its task ICV
// to point to the level zero task ICV. That ICV was init in
// InitTeamDescr()
......@@ -73,7 +74,7 @@ EXTERN void __kmpc_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime) {
// set number of threads and thread limit in team to started value
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
currTaskDescr->NThreads() = GetNumberOfWorkersInTeam();
nThreads = GetNumberOfWorkersInTeam();
threadLimit = ThreadLimit;
}
......@@ -106,7 +107,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
}
if (!RequiresOMPRuntime) {
// Runtime is not required - exit.
__SYNCTHREADS();
__kmpc_impl_syncthreads();
return;
}
......@@ -123,10 +124,9 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
// init team context
currTeamDescr.InitTeamDescr(/*isSPMDExecutionMode=*/true);
currTeamDescr.InitTeamDescr();
}
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
__kmpc_impl_syncthreads();
omptarget_nvptx_TeamDescr &currTeamDescr = getMyTeamDescriptor();
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
......@@ -137,8 +137,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
ASSERT0(LT_FUSSY, newTaskDescr, "expected a task descr");
newTaskDescr->InitLevelOneTaskDescr(ThreadLimit,
currTeamDescr.LevelZeroTaskDescr());
newTaskDescr->InitLevelOneTaskDescr(currTeamDescr.LevelZeroTaskDescr());
// install new top descriptor
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
newTaskDescr);
......@@ -147,7 +146,7 @@ EXTERN void __kmpc_spmd_kernel_init(int ThreadLimit, int16_t RequiresOMPRuntime,
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->ThreadsInTeam());
(int)newTaskDescr->ThreadId(), (int)ThreadLimit);
if (RequiresDataSharing && GetLaneId() == 0) {
// Warp master initializes data sharing environment.
......@@ -169,8 +168,7 @@ EXTERN void __kmpc_spmd_kernel_deinit_v2(int16_t RequiresOMPRuntime) {
if (!RequiresOMPRuntime)
return;
// FIXME: use __syncthreads instead when the function copy is fixed in LLVM.
__SYNCTHREADS();
__kmpc_impl_syncthreads();
int threadId = GetThreadIdInBlock();
if (threadId == 0) {
// Enqueue omp state object for use by another team.
......
......@@ -48,20 +48,13 @@
// Macros for Cuda intrinsics
// In Cuda 9.0, the *_sync() version takes an extra argument 'mask'.
// Also, __ballot(1) in Cuda 8.0 is replaced with __activemask().
#if defined(CUDART_VERSION) && CUDART_VERSION >= 9000
#define __SHFL_SYNC(mask, var, srcLane) __shfl_sync((mask), (var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down_sync((mask), (var), (delta), (width))
#ifndef CUDA_VERSION
#error CUDA_VERSION macro is undefined, something is wrong with CUDA.
#elif CUDA_VERSION >= 9000
#define __ACTIVEMASK() __activemask()
#else
#define __SHFL_SYNC(mask, var, srcLane) __shfl((var), (srcLane))
#define __SHFL_DOWN_SYNC(mask, var, delta, width) \
__shfl_down((var), (delta), (width))
#define __ACTIVEMASK() __ballot(1)
#endif
#define __SYNCTHREADS_N(n) asm volatile("bar.sync %0;" : : "r"(n) : "memory");
#define __SYNCTHREADS() __SYNCTHREADS_N(0)
#endif // CUDA_VERSION
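Given the macros removed above, the ``__kmpc_impl_*`` wrappers this patch switches to plausibly dispatch on ``CUDA_VERSION`` along these lines (a sketch only; the real definitions live in target_impl.h, which this excerpt does not show):

INLINE int32_t __kmpc_impl_shfl_sync(uint32_t mask, int32_t var,
                                     int32_t srcLane) {
#if CUDA_VERSION >= 9000
  return __shfl_sync(mask, var, srcLane);
#else
  return __shfl(var, srcLane); // pre-9.0 shuffles take no mask argument
#endif
}

INLINE void __kmpc_impl_syncwarp(uint32_t mask) {
#if CUDA_VERSION >= 9000
  __syncwarp(mask);
#else
  // CUDA 8 warps execute in lockstep; no explicit warp barrier is needed.
#endif
}

INLINE void __kmpc_impl_syncthreads() {
  // Whole-CTA barrier, matching the removed __SYNCTHREADS (bar.sync 0).
  asm volatile("bar.sync %0;" : : "r"(0) : "memory");
}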
// arguments needed for L0 parallelism only.
class omptarget_nvptx_SharedArgs {
......@@ -164,24 +157,20 @@ public:
}
INLINE int IsTaskConstruct() const { return !IsParallelConstruct(); }
// methods for other fields
INLINE uint16_t &NThreads() { return items.nthreads; }
INLINE uint16_t &ThreadId() { return items.threadId; }
INLINE uint16_t &ThreadsInTeam() { return items.threadsInTeam; }
INLINE uint64_t &RuntimeChunkSize() { return items.runtimeChunkSize; }
INLINE omptarget_nvptx_TaskDescr *GetPrevTaskDescr() const { return prev; }
INLINE void SetPrevTaskDescr(omptarget_nvptx_TaskDescr *taskDescr) {
prev = taskDescr;
}
// init & copy
INLINE void InitLevelZeroTaskDescr(bool isSPMDExecutionMode);
INLINE void InitLevelOneTaskDescr(uint16_t tnum,
omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void InitLevelZeroTaskDescr();
INLINE void InitLevelOneTaskDescr(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void Copy(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyData(omptarget_nvptx_TaskDescr *sourceTaskDescr);
INLINE void CopyParent(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyForExplicitTask(omptarget_nvptx_TaskDescr *parentTaskDescr);
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr,
uint16_t tnum);
INLINE void CopyToWorkDescr(omptarget_nvptx_TaskDescr *masterTaskDescr);
INLINE void CopyFromWorkDescr(omptarget_nvptx_TaskDescr *workTaskDescr);
INLINE void CopyConvergentParent(omptarget_nvptx_TaskDescr *parentTaskDescr,
uint16_t tid, uint16_t tnum);
......@@ -211,9 +200,7 @@ private:
struct TaskDescr_items {
uint8_t flags; // 6 bit used (see flag above)
uint8_t unused;
uint16_t nthreads; // thread num for subsequent parallel regions
uint16_t threadId; // thread id
uint16_t threadsInTeam; // threads in current team
uint64_t runtimeChunkSize; // runtime chunk size
} items;
omptarget_nvptx_TaskDescr *prev;
......@@ -253,7 +240,7 @@ public:
INLINE uint64_t *getLastprivateIterBuffer() { return &lastprivateIterBuffer; }
// init
INLINE void InitTeamDescr(bool isSPMDExecutionMode);
INLINE void InitTeamDescr();
INLINE __kmpc_data_sharing_slot *RootS(int wid, bool IsMasterThread) {
// If this is invoked by the master thread of the master warp then initialize
......@@ -407,6 +394,8 @@ extern __device__ __shared__ uint32_t usedSlotIdx;
extern __device__ __shared__ uint8_t
parallelLevel[MAX_THREADS_PER_TEAM / WARPSIZE];
extern __device__ __shared__ uint16_t threadLimit;
extern __device__ __shared__ uint16_t threadsInTeam;
extern __device__ __shared__ uint16_t nThreads;
extern __device__ __shared__
omptarget_nvptx_ThreadPrivateContext *omptarget_nvptx_threadPrivateContext;
......
......@@ -31,7 +31,7 @@ INLINE void omptarget_nvptx_TaskDescr::SetRuntimeSched(omp_sched_t sched) {
}
INLINE void
omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr() {
// slow method
// flag:
// default sched is static,
......@@ -39,17 +39,14 @@ omptarget_nvptx_TaskDescr::InitLevelZeroTaskDescr(bool isSPMDExecutionMode) {
// not in parallel
items.flags = 0;
items.nthreads = GetNumberOfProcsInTeam(isSPMDExecutionMode);
// threads: whatever was allocated by the kernel
items.threadId = 0; // is master
items.threadsInTeam = 1; // sequential
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
}
// This is called when all threads are started together in SPMD mode.
// OMP directives include target parallel, target distribute parallel for, etc.
INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
uint16_t tnum, omptarget_nvptx_TaskDescr *parentTaskDescr) {
omptarget_nvptx_TaskDescr *parentTaskDescr) {
// slow method
// flag:
// default sched is static,
......@@ -58,10 +55,8 @@ INLINE void omptarget_nvptx_TaskDescr::InitLevelOneTaskDescr(
items.flags =
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
items.nthreads = 0; // # threads for subsequent parallel region
items.threadId =
GetThreadIdInBlock(); // get ids from cuda (only called for 1st level)
items.threadsInTeam = tnum;
items.runtimeChunkSize = 1; // preferred chunking: static with chunk 1
prev = parentTaskDescr;
}
......@@ -91,12 +86,11 @@ INLINE void omptarget_nvptx_TaskDescr::CopyForExplicitTask(
}
INLINE void omptarget_nvptx_TaskDescr::CopyToWorkDescr(
omptarget_nvptx_TaskDescr *masterTaskDescr, uint16_t tnum) {
omptarget_nvptx_TaskDescr *masterTaskDescr) {
CopyParent(masterTaskDescr);
// overwrite specific items
items.flags |=
TaskDescr_InPar | TaskDescr_IsParConstr; // set flag to parallel
items.threadsInTeam = tnum; // set number of threads
}
INLINE void omptarget_nvptx_TaskDescr::CopyFromWorkDescr(
......@@ -121,7 +115,6 @@ INLINE void omptarget_nvptx_TaskDescr::CopyConvergentParent(
omptarget_nvptx_TaskDescr *parentTaskDescr, uint16_t tid, uint16_t tnum) {
CopyParent(parentTaskDescr);
items.flags |= TaskDescr_InParL2P; // In L2+ parallelism
items.threadsInTeam = tnum; // set number of threads
items.threadId = tid;
}
......@@ -177,8 +170,8 @@ omptarget_nvptx_ThreadPrivateContext::InitThreadPrivateContext(int tid) {
// Team Descriptor
////////////////////////////////////////////////////////////////////////////////
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr(bool isSPMDExecutionMode) {
levelZeroTaskDescr.InitLevelZeroTaskDescr(isSPMDExecutionMode);
INLINE void omptarget_nvptx_TeamDescr::InitTeamDescr() {
levelZeroTaskDescr.InitLevelZeroTaskDescr();
}
////////////////////////////////////////////////////////////////////////////////
......
......@@ -55,7 +55,7 @@
////////////////////////////////////////////////////////////////////////////////
#define EXTERN extern "C" __device__
#define INLINE __inline__ __device__
#define INLINE __forceinline__ __device__
#define NOINLINE __noinline__ __device__
#ifndef TRUE
#define TRUE 1
......
......@@ -33,6 +33,7 @@
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
typedef struct ConvergentSimdJob {
omptarget_nvptx_TaskDescr taskDescr;
......@@ -48,13 +49,12 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
int32_t *LaneId, int32_t *NumLanes) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_simd\n");
uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __popc(ConvergentMask);
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __ffs(WorkRemaining);
*IsFinal = __popc(WorkRemaining) == 1;
uint32_t lanemask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
*LaneId = __popc(ConvergentMask & lanemask_lt);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
*LaneId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
......@@ -64,7 +64,7 @@ EXTERN bool __kmpc_kernel_convergent_simd(void *buffer, uint32_t Mask,
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId);
job->slimForNextSimd = SimdLimit;
int32_t SimdLimitSource = __SHFL_SYNC(Mask, SimdLimit, *LaneSource);
int32_t SimdLimitSource = __kmpc_impl_shfl_sync(Mask, SimdLimit, *LaneSource);
// reset simdlimit to avoid propagating to successive #simd
if (SimdLimitSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->SimdLimitForNextSimd(threadId) = 0;
......@@ -122,13 +122,12 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
int32_t *LaneSource) {
PRINT0(LD_IO, "call to __kmpc_kernel_convergent_parallel\n");
uint32_t ConvergentMask = Mask;
int32_t ConvergentSize = __popc(ConvergentMask);
int32_t ConvergentSize = __kmpc_impl_popc(ConvergentMask);
uint32_t WorkRemaining = ConvergentMask >> (*LaneSource + 1);
*LaneSource += __ffs(WorkRemaining);
*IsFinal = __popc(WorkRemaining) == 1;
uint32_t lanemask_lt;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
uint32_t OmpId = __popc(ConvergentMask & lanemask_lt);
*LaneSource += __kmpc_impl_ffs(WorkRemaining);
*IsFinal = __kmpc_impl_popc(WorkRemaining) == 1;
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t OmpId = __kmpc_impl_popc(ConvergentMask & lanemask_lt);
int threadId = GetLogicalThreadIdInBlock(isSPMDMode());
int sourceThreadId = (threadId & ~(WARPSIZE - 1)) + *LaneSource;
......@@ -138,7 +137,8 @@ EXTERN bool __kmpc_kernel_convergent_parallel(void *buffer, uint32_t Mask,
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
job->tnumForNextPar = NumThreadsClause;
int32_t NumThreadsSource = __SHFL_SYNC(Mask, NumThreadsClause, *LaneSource);
int32_t NumThreadsSource =
__kmpc_impl_shfl_sync(Mask, NumThreadsClause, *LaneSource);
// reset numthreads to avoid propagating to successive #parallel
if (NumThreadsSource > 0 && threadId == sourceThreadId)
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId) =
......@@ -249,8 +249,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
uint16_t &NumThreadsClause =
omptarget_nvptx_threadPrivateContext->NumThreadsForNextParallel(threadId);
uint16_t NumThreads = determineNumberOfThreads(
NumThreadsClause, currTaskDescr->NThreads(), threadLimit);
uint16_t NumThreads =
determineNumberOfThreads(NumThreadsClause, nThreads, threadLimit);
if (NumThreadsClause != 0) {
// Reset request to avoid propagating to successive #parallel
......@@ -264,7 +264,8 @@ EXTERN void __kmpc_kernel_prepare_parallel(void *WorkFn,
// Set number of threads on work descriptor.
omptarget_nvptx_WorkDescr &workDescr = getMyWorkDescriptor();
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr, NumThreads);
workDescr.WorkTaskDescr()->CopyToWorkDescr(currTaskDescr);
threadsInTeam = NumThreads;
}
// All workers call this function. Deactivate those not needed.
......@@ -294,7 +295,7 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
// Set to true for workers participating in the parallel region.
bool isActive = false;
// Initialize state for active threads.
if (threadId < workDescr.WorkTaskDescr()->ThreadsInTeam()) {
if (threadId < threadsInTeam) {
// init this thread's task descriptor from the work descriptor
omptarget_nvptx_TaskDescr *newTaskDescr =
omptarget_nvptx_threadPrivateContext->Level1TaskDescr(threadId);
......@@ -307,10 +308,10 @@ EXTERN bool __kmpc_kernel_parallel(void **WorkFn,
PRINT(LD_PAR,
"thread will execute parallel region with id %d in a team of "
"%d threads\n",
(int)newTaskDescr->ThreadId(), (int)newTaskDescr->NThreads());
(int)newTaskDescr->ThreadId(), (int)nThreads);
isActive = true;
IncParallelLevel(workDescr.WorkTaskDescr()->ThreadsInTeam() != 1);
IncParallelLevel(threadsInTeam != 1);
}
return isActive;
......@@ -328,7 +329,7 @@ EXTERN void __kmpc_kernel_end_parallel() {
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(
threadId, currTaskDescr->GetPrevTaskDescr());
DecParallelLevel(currTaskDescr->ThreadsInTeam() != 1);
DecParallelLevel(threadsInTeam != 1);
}
////////////////////////////////////////////////////////////////////////////////
......@@ -367,7 +368,6 @@ EXTERN void __kmpc_serialized_parallel(kmp_Ident *loc, uint32_t global_tid) {
// - each thread becomes ID 0 in its serialized parallel, and
// - there is only one thread per team
newTaskDescr->ThreadId() = 0;
newTaskDescr->ThreadsInTeam() = 1;
// set new task descriptor as top
omptarget_nvptx_threadPrivateContext->SetTopLevelTaskDescr(threadId,
......
......@@ -15,53 +15,7 @@
#include <stdio.h>
#include "omptarget-nvptx.h"
// may eventually remove this
EXTERN
int32_t __gpu_block_reduce() {
bool isSPMDExecutionMode = isSPMDMode();
int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
int nt = GetNumberOfOmpThreads(tid, isSPMDExecutionMode);
if (nt != blockDim.x)
return 0;
unsigned tnum = __ACTIVEMASK();
if (tnum != (~0x0)) // assume warpSize is 32
return 0;
return 1;
}
EXTERN
int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
size_t reduce_size, void *reduce_data,
void *reduce_array_size, kmp_ReductFctPtr *reductFct,
kmp_CriticalName *lck) {
int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
int numthread;
if (currTaskDescr->IsParallelConstruct()) {
numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc));
} else {
numthread = GetNumberOfOmpTeams();
}
if (numthread == 1)
return 1;
if (!__gpu_block_reduce())
return 2;
if (threadIdx.x == 0)
return 1;
return 0;
}
EXTERN
int32_t __kmpc_reduce_combined(kmp_Ident *loc) {
return threadIdx.x == 0 ? 2 : 0;
}
EXTERN
int32_t __kmpc_reduce_simd(kmp_Ident *loc) {
return (threadIdx.x % 32 == 0) ? 1 : 0;
}
#include "target_impl.h"
EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}
......@@ -70,16 +24,15 @@ EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
return __kmpc_impl_shfl_down_sync(0xFFFFFFFF, val, delta, size);
}
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
int lo, hi;
asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
return val;
uint32_t lo, hi;
__kmpc_impl_unpack(val, lo, hi);
hi = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, hi, delta, size);
lo = __kmpc_impl_shfl_down_sync(0xFFFFFFFF, lo, delta, size);
return __kmpc_impl_pack(lo, hi);
}
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
......@@ -106,18 +59,16 @@ INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
uint32_t lanemask_lt;
uint32_t lanemask_gt;
uint32_t size, remote_id, physical_lane_id;
physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
uint32_t lanemask_lt = __kmpc_impl_lanemask_lt();
uint32_t Liveness = __ACTIVEMASK();
uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
uint32_t logical_lane_id = __kmpc_impl_popc(Liveness & lanemask_lt) * 2;
uint32_t lanemask_gt = __kmpc_impl_lanemask_gt();
do {
Liveness = __ACTIVEMASK();
remote_id = __ffs(Liveness & lanemask_gt);
size = __popc(Liveness);
remote_id = __kmpc_impl_ffs(Liveness & lanemask_gt);
size = __kmpc_impl_popc(Liveness);
logical_lane_id /= 2;
shflFct(reduce_data, /*LaneId =*/logical_lane_id,
/*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
......@@ -147,8 +98,7 @@ static int32_t nvptx_parallel_reduce_nowait(
kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
uint32_t NumThreads =
GetNumberOfOmpThreads(BlockThreadId, isSPMDExecutionMode);
uint32_t NumThreads = GetNumberOfOmpThreads(isSPMDExecutionMode);
if (NumThreads == 1)
return 1;
/*
......@@ -197,7 +147,7 @@ static int32_t nvptx_parallel_reduce_nowait(
gpu_regular_warp_reduce(reduce_data, shflFct);
else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__popc(Liveness),
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
// parallel region may enter here; return
......@@ -279,9 +229,8 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
isSPMDExecutionMode
? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
isSPMDExecutionMode ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ volatile bool IsLastTeam;
......@@ -373,7 +322,7 @@ static int32_t nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
gpu_regular_warp_reduce(reduce_data, shflFct);
else // Partial warp but contiguous lanes
gpu_irregular_warp_reduce(reduce_data, shflFct,
/*LaneCount=*/__popc(Liveness),
/*LaneCount=*/__kmpc_impl_popc(Liveness),
/*LaneId=*/ThreadId % WARPSIZE);
// When we have more than [warpsize] number of threads
......@@ -473,9 +422,8 @@ EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_v2(
// In generic mode only the team master participates in the teams
// reduction because the workers are waiting for parallel work.
uint32_t NumThreads =
checkSPMDMode(loc)
? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
checkSPMDMode(loc) ? GetNumberOfOmpThreads(/*isSPMDExecutionMode=*/true)
: /*Master thread only*/ 1;
uint32_t TeamId = GetBlockIdInKernel();
uint32_t NumTeams = GetNumberOfBlocksInKernel();
__shared__ unsigned Bound;
......
......@@ -54,8 +54,7 @@ INLINE int GetOmpThreadId(int threadId,
INLINE int GetOmpTeamId(); // omp_team_num
// get OpenMP number of threads and team
INLINE int GetNumberOfOmpThreads(int threadId,
bool isSPMDExecutionMode); // omp_num_threads
INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode); // omp_num_threads
INLINE int GetNumberOfOmpTeams(); // omp_num_teams
// get OpenMP number of procs
......
......@@ -14,6 +14,8 @@
// Execution Parameters
////////////////////////////////////////////////////////////////////////////////
#include "target_impl.h"
INLINE void setExecutionParameters(ExecutionMode EMode, RuntimeMode RMode) {
execution_param = EMode;
execution_param |= RMode;
......@@ -165,18 +167,16 @@ INLINE int GetOmpThreadId(int threadId, bool isSPMDExecutionMode) {
return rc;
}
INLINE int GetNumberOfOmpThreads(int threadId, bool isSPMDExecutionMode) {
INLINE int GetNumberOfOmpThreads(bool isSPMDExecutionMode) {
// omp_num_threads
int rc;
if ((parallelLevel[GetWarpId()] & (OMP_ACTIVE_PARALLEL_LEVEL - 1)) > 1) {
int Level = parallelLevel[GetWarpId()];
if (Level != OMP_ACTIVE_PARALLEL_LEVEL + 1) {
rc = 1;
} else if (isSPMDExecutionMode) {
rc = GetNumberOfThreadsInBlock();
} else {
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(threadId);
ASSERT0(LT_FUSSY, currTaskDescr, "expected a top task descr");
rc = currTaskDescr->ThreadsInTeam();
rc = threadsInTeam;
}
return rc;
......@@ -204,25 +204,29 @@ INLINE int IsTeamMaster(int ompThreadId) { return (ompThreadId == 0); }
// Parallel level
INLINE void IncParallelLevel(bool ActiveParallel) {
unsigned tnum = __ACTIVEMASK();
int leader = __ffs(tnum) - 1;
__SHFL_SYNC(tnum, leader, leader);
if (GetLaneId() == leader) {
unsigned Active = __ACTIVEMASK();
__kmpc_impl_syncwarp(Active);
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] +=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
__SHFL_SYNC(tnum, leader, leader);
__kmpc_impl_syncwarp(Active);
}
INLINE void DecParallelLevel(bool ActiveParallel) {
unsigned tnum = __ACTIVEMASK();
int leader = __ffs(tnum) - 1;
__SHFL_SYNC(tnum, leader, leader);
if (GetLaneId() == leader) {
unsigned Active = __ACTIVEMASK();
__kmpc_impl_syncwarp(Active);
unsigned LaneMaskLt = __kmpc_impl_lanemask_lt();
unsigned Rank = __kmpc_impl_popc(Active & LaneMaskLt);
if (Rank == 0) {
parallelLevel[GetWarpId()] -=
(1 + (ActiveParallel ? OMP_ACTIVE_PARALLEL_LEVEL : 0));
__threadfence();
}
__SHFL_SYNC(tnum, leader, leader);
__kmpc_impl_syncwarp(Active);
}
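For orientation, the per-warp parallelLevel counter packs two facts into one byte; a worked example of the encoding follows (the value of OMP_ACTIVE_PARALLEL_LEVEL is an assumption here, taken as 128, the high bit of a uint8_t):

// depth  = level & (OMP_ACTIVE_PARALLEL_LEVEL - 1)  -> nesting depth
// active = level & OMP_ACTIVE_PARALLEL_LEVEL        -> an active region exists
//
// Starting from the sequential state (level == 0):
//   IncParallelLevel(true)    0   -> 129   depth 1, active bit set
//   IncParallelLevel(false)   129 -> 130   depth 2, nested serialized region
//   DecParallelLevel(false)   130 -> 129
//   DecParallelLevel(true)    129 -> 0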
////////////////////////////////////////////////////////////////////////////////
......
......@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "omptarget-nvptx.h"
#include "target_impl.h"
////////////////////////////////////////////////////////////////////////////////
// KMP Ordered calls
......@@ -46,10 +47,8 @@ EXTERN void __kmpc_barrier(kmp_Ident *loc_ref, int32_t tid) {
__kmpc_barrier_simple_spmd(loc_ref, tid);
} else {
tid = GetLogicalThreadIdInBlock(checkSPMDMode(loc_ref));
omptarget_nvptx_TaskDescr *currTaskDescr =
omptarget_nvptx_threadPrivateContext->GetTopLevelTaskDescr(tid);
int numberOfActiveOMPThreads =
GetNumberOfOmpThreads(tid, checkSPMDMode(loc_ref));
GetNumberOfOmpThreads(checkSPMDMode(loc_ref));