Commit 1b55a9ff authored by rue's avatar rue

add 512 AVX2 intrinsics calls and FMAdd operator

parent a332bdd8
......@@ -65,6 +65,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
option( SCALFMM_USE_ADDONS "Set to ON to compile add ons" OFF )
option( SCALFMM_USE_SSE "Set to ON to compile with SSE support (and use intrinsec SSE P2P)" ON )
option( SCALFMM_USE_AVX "Set to ON to compile with AVX support (and use intrinsec AVX P2P)" OFF )
option( SCALFMM_USE_AVX2 "Set to ON to compile with AVX support (and use intrinsec AVXZ P2P)" OFF )
option( SCALFMM_USE_ASSERT "Set to ON to enable safe tests during execution" ON )
option( SCALFMM_USE_MIC_NATIVE "Set to ON to compile in native mode for MIC" OFF )
option( SCALFMM_ONLY_DEVEL "Set to ON to compile Development tools (only scalfmm team)" ON )
......@@ -112,6 +113,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
set(SSE_FLAGS "-msse4 -mfpmath=sse") # -mtune=native -march=native
else(APPLE)
set(AVX_FLAGS "-mavx")
set(AVX2_FLAGS "-mavx2")
set(SSE_FLAGS "-axSSE4.2")
endif(APPLE)
#-Wshadow -Wpointer-arith -Wcast-qual -Wconversion -Wall -Wnosign-conversion ")
......@@ -131,6 +133,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
endif()
endif()
set(AVX_FLAGS "-mtune=native -march=native")
set(AVX2_FLAGS "-mtune=native -march=native -mmic")
IF (APPLE)
set(SSE_FLAGS "-msse3 -mfpmath=sse") # -mtune=native -march=native
else(APPLE)
......@@ -545,6 +548,42 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_SOURCE_DIR}/CMakeModules/morse/")
message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX BUT NOT BOTH. ")
endif(SCALFMM_USE_AVX AND SCALFMM_USE_SSE)
##################################################################
# Use AVX2 #
##################################################################
message(STATUS "SCALFMM_USE_AVX2 = ${SCALFMM_USE_AVX2}")
if(SCALFMM_USE_AVX2)
if(NOT EXISTS ${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp)
message(FATAL_ERROR "The CompileTestSseFile does not exist (${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx.cpp)" )
endif()
try_compile(COMPILE_AVX2 ${CMAKE_CURRENT_BINARY_DIR}
${SCALFMM_CMAKE_MODULE_PATH}/compileTestAvx2.cpp
COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}"
OUTPUT_VARIABLE COMPILE_AVX2_OUTPUT)
if(${COMPILE_AVX2})
set(SCALFMM_CXX_FLAGS "${SCALFMM_CXX_FLAGS} ${AVX2_FLAGS}")
#set( SCALFMM_USE_SSE OFF FORCE) # ne marche pas
try_compile(COMPILE_RESULT_AVSPE ${CMAKE_CURRENT_BINARY_DIR}
${SCALFMM_CMAKE_MODULE_PATH}/checkAVX2pe.cpp
COMPILE_DEFINITIONS "${CMAKE_CXX_FLAGS} ${AVX2_FLAGS}")
if( NOT ${COMPILE_RESULT_AVSPE})
set(__AVX2PE_INTEL_COMPILER ON)
endif()
message(STATUS ${CMAKE_CXX_FLAGS} )
else(${COMPILE_AVX2})
message(FATAL_ERROR "AVX2 NOT SUPPORTED ; Set SCALFMM_USE_AVX2 to OFF \n Output from test is : ${COMPILE_AVX_OUTPUT} ")
endif(${COMPILE_AVX2})
endif(SCALFMM_USE_AVX2)
list(APPEND FUSE_LIST "AVX2")
#
# Error if both SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE are set
#
if( SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
message(FATAL_ERROR "Check SCALFMM_USE_SSE or SCALFMM_USE_AVX2 BUT NOT BOTH. ")
endif(SCALFMM_USE_AVX2 AND SCALFMM_USE_SSE)
##################################################################
# Use native MIC compilation #
##################################################################
if( SCALFMM_USE_MIC_NATIVE )
......
......@@ -474,12 +474,12 @@ static void GenericFullMutual(ContainerClass* const FRestrict inTargets, Contain
tfx += dKxy[0];
tfy += dKxy[1];
tfz += dKxy[2];
tpo += Kxy[0] * sourcesPhysicalValues[idxSource];
tpo = FMath::FMAdd(Kxy[0],sourcesPhysicalValues[idxSource],tpo);
sourcesForcesX[idxSource] -= dKxy[0];
sourcesForcesY[idxSource] -= dKxy[1];
sourcesForcesZ[idxSource] -= dKxy[2];
sourcesPotentials[idxSource] += Kxy[0] * tv;
sourcesPotentials[idxSource] = FMath::FMAdd(Kxy[0],tv,sourcesPotentials[idxSource]);
}
targetsForcesX[idxTarget] += FMath::ConvertTo<FReal, ComputeClass>(tfx);
......@@ -610,6 +610,36 @@ struct FP2PT<float>{
FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
}
};
#elif defined(SCALFMM_USE_AVX2)
template <>
struct FP2PT<double>{
template <class ContainerClass, class MatrixKernelClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
FP2P::GenericFullMutual<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
}
template <class ContainerClass, class MatrixKernelClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
FP2P::GenericFullRemote<double, ContainerClass, MatrixKernelClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
}
};
template <>
struct FP2PT<float>{
template <class ContainerClass, class MatrixKernelClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
FP2P::GenericFullMutual<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
}
template <class ContainerClass, class MatrixKernelClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors, const MatrixKernelClass *const MatrixKernel){
FP2P::GenericFullRemote<float, ContainerClass, MatrixKernelClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors, MatrixKernel);
}
};
#elif defined(SCALFMM_USE_SSE)
template <>
struct FP2PT<double>{
......
// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// Copyright SCALFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
......@@ -339,6 +339,36 @@ struct FP2PRT<float>{
FP2PR::GenericFullRemote<float, ContainerClass, __m256, 8>(inTargets, inNeighbors, limiteNeighbors);
}
};
#elif defined(SCALFMM_USE_AVX2)
template <>
struct FP2PRT<double>{
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
FP2PR::GenericFullMutual<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
}
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
FP2PR::GenericFullRemote<double, ContainerClass, __m512d, 8>(inTargets, inNeighbors, limiteNeighbors);
}
};
template <>
struct FP2PRT<float>{
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
FP2PR::GenericFullMutual<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
}
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
FP2PR::GenericFullRemote<float, ContainerClass, __m512, 16>(inTargets, inNeighbors, limiteNeighbors);
}
};
#elif defined(SCALFMM_USE_SSE)
template <>
......
......@@ -80,8 +80,13 @@ typedef long long MortonIndex;
#define Prefetch_Write(X) __builtin_prefetch(X,1,1)
#else
#ifdef __INTEL_COMPILER
#define Prefetch_Read(X) _mm_prefetch(X,_MM_HINT_T0)
#define Prefetch_Write(X) _mm_prefetch(X,_MM_HINT_T0)
#ifdef ScalFMM_USE_AVX2
#define Prefetch_Read(X) _mm512_prefetch(X,_MM_HINT_T0)
#define Prefetch_Write(X) _mm512_prefetch(X,_MM_HINT_T0)
#else
#define Prefetch_Read(X) _mm_prefetch(X,_MM_HINT_T0)
#define Prefetch_Write(X) _mm_prefetch(X,_MM_HINT_T0)
#endif
#else
#warning compiler is not defined
#define Prefetch_Read(X)
......
......@@ -71,6 +71,17 @@ struct FMath{
static __m256d Abs(const __m256d inV){
return _mm256_max_pd(_mm256_sub_pd(_mm256_setzero_pd(), inV), inV);
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
static __m512 Abs(const __m512 inV){
return _mm512_max_ps(_mm512_sub_ps(_mm512_setzero_ps(), inV), inV);
}
static __m512d Abs(const __m512d inV){
return _mm512_max_pd(_mm512_sub_pd(_mm512_setzero_pd(), inV), inV);
}
#endif
#endif
/** To get max between 2 values */
......@@ -115,7 +126,23 @@ struct FMath{
return _mm256_min_pd(inV1, inV2);
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
static __m512 Max(const __m512 inV1, const __m512 inV2){
return _mm512_max_ps(inV1, inV2);
}
static __m512 Min(const __m512 inV1, const __m512 inV2){
return _mm512_min_ps(inV1, inV2);
}
static __m512d Max(const __m512d inV1, const __m512d inV2){
return _mm512_max_pd(inV1, inV2);
}
static __m512d Min(const __m512d inV1, const __m512d inV2){
return _mm512_min_pd(inV1, inV2);
}
#endif
#endif
/** To know if 2 values seems to be equal */
template <class NumType>
static bool LookEqual(const NumType inV1, const NumType inV2){
......@@ -181,7 +208,25 @@ struct FMath{
return _mm256_ceil_pd(inV);
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
static __m512 dfloor(const __m512 inV){
return _mm512_floor_ps(inV);
}
static __m512d dfloor(const __m512d inV){
return _mm512_floor_pd(inV);
}
static __m512 Ceil(const __m512 inV){
return _mm512_ceil_ps(inV);
}
static __m512d Ceil(const __m512d inV){
return _mm512_ceil_pd(inV);
}
#endif
#endif
/** To get pow */
static double pow(double x, double y){
return ::pow(x,y);
......@@ -217,7 +262,46 @@ struct FMath{
static bool Between(const NumType inValue, const NumType inMin, const NumType inMax){
return ( inMin <= inValue && inValue < inMax );
}
/** To compute fmadd operations **/
template <class NumType>
static NumType FMAdd(const NumType a, const NumType b, const NumType c){
return a * b + c;
}
#if defined(SCALFMM_USE_SSE ) && defined(__SSSE4_1__)
static __m128 FMAdd(const __m128 inV1, const __m128 inV2, const __m128 inV3){
return _mm_add_ps( _mm_mul_ps(inV1,inV2), inV3);
}
static __m128d FMAdd(const __m128d inV1, const __m128d inV2, const __m128d inV3){
return _mm_add_pd( _mm_mul_pd(inV1,inV2), inV3);
}
#endif
#ifdef SCALFMM_USE_AVX
static __m256 FMAdd(const __m256 inV1, const __m256 inV2, const __m256 inV3){
return _mm256_add_ps( _mm256_mul_ps(inV1,inV2), inV3);
}
static __m256d FMAdd(const __m256d inV1, const __m256d inV2, const __m256d inV3){
return _mm256_add_pd( _mm256_mul_pd(inV1,inV2), inV3);
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
static __m512 FMAdd(const __m512 inV1, const __m512 inV2, const __m512 inV3){
//return _mm512_add_ps( _mm512_mul_ps(inV1,inV2), inV3);
return _mm512_fmadd_ps(inV1, inV2, inV3);
}
static __m512d FMAdd(const __m512d inV1, const __m512d inV2, const __m512d inV3){
//return _mm512_add_pd( _mm512_mul_pd(inV1,inV2), inV3);
return _mm512_fmadd_pd(inV1, inV2, inV3);
}
#endif
#endif
/** To get sqrt of a FReal */
static float Sqrt(const float inValue){
return sqrtf(inValue);
......@@ -265,7 +349,25 @@ struct FMath{
return _mm256_set1_pd(1.0) / _mm256_sqrt_pd(inV);
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
static __m512 Sqrt(const __m512 inV){
return _mm512_sqrt_ps(inV);
}
static __m512d Sqrt(const __m512d inV){
return _mm512_sqrt_pd(inV);
}
static __m512 Rsqrt(const __m512 inV){
return _mm512_rsqrt_ps(inV);
}
static __m512d Rsqrt(const __m512d inV){
return _mm512_set1_pd(1.0) / _mm512_sqrt_pd(inV);
}
#endif
#endif
/** To get Log of a FReal */
static float Log(const float inValue){
return logf(inValue);
......@@ -607,7 +709,63 @@ inline double FMath::ConvertTo<double,__m256d>(const __m256d val){
_mm256_store_pd(buffer, val);
return buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
#endif
#ifdef SCALFMM_USE_AVX2
#ifdef __MIC__
template <>
inline __m512 FMath::One<__m512>(){
return _mm512_set1_ps(1.0);
}
template <>
inline __m512d FMath::One<__m512d>(){
return _mm512_set1_pd(1.0);
}
template <>
inline __m512 FMath::Zero<__m512>(){
return _mm512_setzero_ps();
}
template <>
inline __m512d FMath::Zero<__m512d>(){
return _mm512_setzero_pd();
}
template <>
inline __m512 FMath::ConvertTo<__m512,__attribute__((aligned(64))) float>(const float val){
return _mm512_set1_ps(val);
}
template <>
inline __m512d FMath::ConvertTo<__m512d,__attribute__((aligned(64))) double>(const double val){
return _mm512_set1_pd(val);
}
template <>
inline __m512 FMath::ConvertTo<__m512,const __attribute__((aligned(64))) float*>(const float* val){
return _mm512_set1_ps(val[0]);
}
template <>
inline __m512d FMath::ConvertTo<__m512d,const __attribute__((aligned(64))) double*>(const double* val){
return _mm512_set1_pd(val[0]);
}
template <>
inline float FMath::ConvertTo<float,__m512>(const __m512 val){
__attribute__((aligned(64))) float buffer[16];
_mm512_store_ps(buffer, val);
return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7] + buffer[8] + buffer[9] + buffer[10] + buffer[11] + buffer[12] + buffer[13] + buffer[14] + buffer[15];
}
template <>
inline double FMath::ConvertTo<double,__m512d>(const __m512d val){
__attribute__((aligned(64))) double buffer[8];
_mm512_store_pd(buffer, val);
return buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
}
#endif
#endif
#endif //FMATH_HPP
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment