Commit f0b0460d authored by Berenger Bramas's avatar Berenger Bramas

Use inastemp

parent 2c5546a4
[submodule "CMakeModules/morse_cmake"]
path = CMakeModules/morse_cmake
url = https://gitlab.inria.fr/solverstack/morse_cmake.git
[submodule "inastemp"]
path = inastemp
url = https://gitlab.mpcdf.mpg.de/bbramas/inastemp.git
This diff is collapsed.
###########################################################################################
# Berenger Bramas Inria
# This goes with the getCpuInfos.cpp
# This will create one CMAKE value per output option from the cpp file.
# For example the output of the CPP file can be:
# SSE3=TRUE;AVX=FALSE
# Then it will create:
# CPUOPTION_SSE3 = TRUE
# CPUOPTION_AVX = FALSE
#
# The binary should return 0 on success.
###########################################################################################
# GetCpuInfos()
#
# Compiles and runs CMakeModules/getCpuInfos.cpp and turns every NAME=VALUE
# entry of its ';'-separated output into a variable CPUOPTION_<NAME> = <VALUE>.
# The complete option list is also stored in CPU_OPTIONS.
#
# This is intentionally a macro (not a function): the CPUOPTION_* variables
# and CPU_OPTIONS must be visible in the caller's scope.
#
# Fails with FATAL_ERROR when the probe source is missing; emits a WARNING
# when the probe does not compile, does not return 0, or prints a malformed
# entry.
macro(GetCpuInfos)
    # The original CPP file
    set(GetCpuInfosFile "${PROJECT_SOURCE_DIR}/CMakeModules/getCpuInfos.cpp")

    # Fatal error if the file does not exist.
    # The path is quoted so that directories containing spaces or ';' work.
    if(NOT EXISTS "${GetCpuInfosFile}")
        message(FATAL_ERROR "The GetCpuInfosFile does not exist (${GetCpuInfosFile})")
    endif()

    # Compile and execute the file
    try_run(RUN_RESULT_VAR COMPILE_RESULT_VAR
            "${CMAKE_BINARY_DIR}" "${GetCpuInfosFile}"
            COMPILE_OUTPUT_VARIABLE comp
            RUN_OUTPUT_VARIABLE run)

    # If it has successfully compiled and run
    if(COMPILE_RESULT_VAR AND (RUN_RESULT_VAR EQUAL 0))
        # Drop leading/trailing whitespace so that a newline printed by the
        # program cannot become part of the first or last option.
        string(STRIP "${run}" run)
        set(CPU_OPTIONS ${run})

        # For each NAME=VALUE entry
        foreach(optionNode ${run})
            # Split the entry into a two-element list: name;value
            string(REPLACE "=" ";" optionNameAndValue "${optionNode}")
            list(LENGTH optionNameAndValue optionLength)
            # If we get both
            if(optionLength EQUAL 2)
                list(GET optionNameAndValue 0 optionName)
                list(GET optionNameAndValue 1 optionValue)
                # Create the cmake variable
                set(CPUOPTION_${optionName} ${optionValue})
            else()
                message(WARNING "GetCpuInfosFile wrong format for ${optionNode}.")
            endif()
        endforeach()

        # Output the sentence from the binary
        message(STATUS "CPUOPTION : ${CPU_OPTIONS}")
    else()
        message(WARNING "GetCpuInfosFile did not return correctly.")
    endif()
endmacro()
// Small test program: applies operator+= to two __m512 values.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include "immintrin.h"
int main() {
// The vector statements are compiled only when __MIC__ is defined;
// otherwise main() just returns 0.
#ifdef __MIC__
// tx and ty are never initialized before being used.
__m512 tx, ty ;
tx += ty ;
#endif
return 0;
}
// Small test program: applies operator+= to two __m256d values.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include "immintrin.h"
int main() {
// tx and ty are never initialized before being used.
__m256d tx, ty ;
tx += ty ;
return 0;
}
// Small test program: applies operator+= to two __m128d values after pulling
// in the SSE headers that the current compiler flags make available.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
#endif
// The guard macro is __SSE4_1__ (the previous spelling __SSSE4_1__ is never
// defined, so <smmintrin.h> was silently skipped).
#ifdef __SSE4_1__
#include <smmintrin.h> // SSE4.1
#endif
int main() {
    // tx and ty are never initialized before being used.
    __m128d tx, ty ;
    tx += ty ;
    return 0;
}
// Small test program: calls the horizontal-add intrinsics on 256-bit and
// 128-bit vectors, in both double and single precision.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
#include <immintrin.h> // AVX
int main(){
// 256-bit vectors: __m256d and __m256 (operands are never initialized).
{
__m256d res0d, res1d;
res0d = _mm256_hadd_pd(res0d, res1d);
__m256 res0, res1;
res0 = _mm256_hadd_ps(res0, res1);
}
// 128-bit vectors: __m128d and __m128 (operands are never initialized).
{
__m128d res0d, res1d;
res0d = _mm_hadd_pd(res0d, res1d);
__m128 res0, res1;
res0 = _mm_hadd_ps(res0, res1);
}
return 0;
}
// Small test program: calls horizontal-add functions on 512-bit (only when
// __MIC__ is defined), 256-bit and 128-bit vectors, in both double and single
// precision.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#include <tmmintrin.h> // SSSE3
#include <smmintrin.h> // SSE4
#include <immintrin.h> // AVX
int main(){
// 512-bit vectors: this scope is empty unless __MIC__ is defined.
{
#ifdef __MIC__
__m512d res0d, res1d;
res0d = _mm512_hadd_pd(res0d, res1d);
__m512 res0, res1;
res0 = _mm512_hadd_ps(res0, res1);
#endif
}
// 256-bit vectors: __m256d and __m256 (operands are never initialized).
{
__m256d res0d, res1d;
res0d = _mm256_hadd_pd(res0d, res1d);
__m256 res0, res1;
res0 = _mm256_hadd_ps(res0, res1);
}
// 128-bit vectors: __m128d and __m128 (operands are never initialized).
{
__m128d res0d, res1d;
res0d = _mm_hadd_pd(res0d, res1d);
__m128 res0, res1;
res0 = _mm_hadd_ps(res0, res1);
}
return 0;
}
int main(){
int i ;
#ifdef __INTEL_COMPILER
i = 0;
#else
#error 'Not Intel Compiler "
#endif
}
// Small test program: calls the 128-bit horizontal-add intrinsics in double
// and single precision after pulling in the SSE headers that the current
// compiler flags make available.
// NOTE(review): looks like a compiler-capability probe where only successful
// compilation matters — confirm against the build scripts that use it.
#include <x86intrin.h>
#include <xmmintrin.h> // SSE
#include <emmintrin.h> // SSE2
#include <pmmintrin.h> // SSE3
#ifdef __SSSE3__
#include <tmmintrin.h> // SSSE3
#endif
// The guard macro is __SSE4_1__ (the previous spelling __SSSE4_1__ is never
// defined, so <smmintrin.h> was silently skipped).
#ifdef __SSE4_1__
#include <smmintrin.h> // SSE4.1
#endif
int main(){
    // Operands are never initialized before being used.
    __m128d res0d, res1d;
    res0d = _mm_hadd_pd(res0d, res1d);
    __m128 res0, res1;
    res0 = _mm_hadd_ps(res0, res1);
    return 0;
}
This diff is collapsed.
......@@ -30,7 +30,7 @@ The following are optional:
### Get and Build ScalFMM
To use the latest development state of ScalFMM, please clone the develop
branch. Note that ScalFMM contains a git submodule `morse_cmake`.
branch. Note that ScalFMM contains two git submodules `morse_cmake` and `inastemp`.
To get sources please use these commands:
``` bash
git clone --recursive git@gitlab.inria.fr:solverstack/ScalFMM.git -b develop
......
......@@ -95,13 +95,13 @@ public:
// apply P2L
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
for (unsigned int m = 0; m<FBase::nnodes; ++m) {
ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
ComputeClass XX = ComputeClass(X[m].getX());
ComputeClass XY = ComputeClass(X[m].getY());
ComputeClass XZ = ComputeClass(X[m].getZ());
std::size_t idxPart = 0;
// Compute using vectorization for all but the last array elements
ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
ComputeClass tmpLocalExp = ComputeClass(0);
for (;
idxPart < ((particles->getNbParticles())
/ FRealCount);
......@@ -114,7 +114,7 @@ public:
* physicalValues[idxPart];
}
local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
local->get(idxRhs)[m] += ComputeClass(tmpLocalExp);
// Compute the last array elements one by one if they exist
if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
......@@ -172,11 +172,11 @@ public:
for (unsigned int n=0; n<FBase::nnodes; ++n){
ComputeClass MultipoleExpansion =
FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
ComputeClass(pole->get(idxRhs)[n]);
ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
ComputeClass YX = ComputeClass(Y[n].getX());
ComputeClass YY = ComputeClass(Y[n].getY());
ComputeClass YZ = ComputeClass(Y[n].getZ());
for(std::size_t idxPart = 0;
idxPart < ( (particles->getNbParticles() + FRealCount - 1)
......
......@@ -140,11 +140,11 @@ public:
// apply P2L
for(int idxRhs = 0 ; idxRhs < NVALS ; ++idxRhs){
for (unsigned int m = 0; m < FBase::nnodes; ++m) {
ComputeClass XX = FMath::ConvertTo<ComputeClass>(X[m].getX());
ComputeClass XY = FMath::ConvertTo<ComputeClass>(X[m].getY());
ComputeClass XZ = FMath::ConvertTo<ComputeClass>(X[m].getZ());
ComputeClass XX = ComputeClass(X[m].getX());
ComputeClass XY = ComputeClass(X[m].getY());
ComputeClass XZ = ComputeClass(X[m].getZ());
ComputeClass tmpLocalExp = FMath::Zero<ComputeClass>();
ComputeClass tmpLocalExp = ComputeClass(0);
// Compute using vectorization for all but the last array elements
std::size_t idxPart = 0;
for (; idxPart < (particles->getNbParticles() / FRealCount);
......@@ -157,7 +157,7 @@ public:
* physicalValues[idxPart];
}
local->get(idxRhs)[m] += FMath::ConvertTo<FReal>(tmpLocalExp);
local->get(idxRhs)[m] += FReal(tmpLocalExp);
// Compute the last array elements one by one if they exist
if(idxPart < ((particles->getNbParticles() + FRealCount - 1) / FRealCount)) {
......@@ -221,11 +221,11 @@ public:
for (unsigned int n=0; n<FBase::nnodes; ++n){
ComputeClass MultipoleExpansion =
FMath::ConvertTo<ComputeClass, FReal>(pole->get(idxRhs)[n]);
ComputeClass(pole->get(idxRhs)[n]);
ComputeClass YX = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getX());
ComputeClass YY = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getY());
ComputeClass YZ = FMath::ConvertTo<ComputeClass, FReal>(Y[n].getZ());
ComputeClass YX = ComputeClass(Y[n].getX());
ComputeClass YY = ComputeClass(Y[n].getY());
ComputeClass YZ = ComputeClass(Y[n].getZ());
for(std::size_t idxPart = 0;
idxPart < ( (particles->getNbParticles() + FRealCount - 1)
......
......@@ -89,7 +89,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffx = (xt-xs);
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
return FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
return ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
}
// evaluate interaction (blockwise)
......@@ -110,7 +110,7 @@ struct FInterpMatrixKernelR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffx = (xt-xs);
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass one_over_r = FMath::One<ValueClass>() / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
const ValueClass one_over_r = ValueClass(1) / FMath::Sqrt(diffx*diffx + diffy*diffy + diffz*diffz);
const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
......@@ -176,9 +176,9 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
const ValueClass diffx = (xt-xs);
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
return FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
return ValueClass(1) / FMath::Sqrt(ValueClass(LX)*diffx*diffx +
ValueClass(LY)*diffy*diffy +
ValueClass(LZ)*diffz*diffz);
}
void setCoeff(const FReal& a, const FReal& b, const FReal& c)
{LX= a*a ; LY = b*b ; LZ = c *c;}
......@@ -208,16 +208,16 @@ struct FInterpMatrixKernelRH :FInterpMatrixKernelR<FReal>{
const ValueClass diffx = (xt-xs);
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass one_over_rL = FMath::One<ValueClass>() / FMath::Sqrt(FMath::ConvertTo<ValueClass,FReal>(LX)*diffx*diffx +
FMath::ConvertTo<ValueClass,FReal>(LY)*diffy*diffy +
FMath::ConvertTo<ValueClass,FReal>(LZ)*diffz*diffz);
const ValueClass one_over_rL = ValueClass(1) / (ValueClass(LX)*diffx*diffx +
ValueClass(LY)*diffy*diffy +
ValueClass(LZ)*diffz*diffz);
const ValueClass one_over_rL3 = one_over_rL*one_over_rL*one_over_rL;
block[0] = one_over_rL;
blockDerivative[0] = FMath::ConvertTo<ValueClass,FReal>(LX) * one_over_rL3 * diffx;
blockDerivative[1] = FMath::ConvertTo<ValueClass,FReal>(LY)* one_over_rL3 * diffy;
blockDerivative[2] = FMath::ConvertTo<ValueClass,FReal>(LZ)* one_over_rL3 * diffz;
blockDerivative[0] = ValueClass(LX) * one_over_rL3 * diffx;
blockDerivative[1] = ValueClass(LY)* one_over_rL3 * diffy;
blockDerivative[2] = ValueClass(LZ)* one_over_rL3 * diffz;
}
......@@ -283,7 +283,7 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffx = (xt-xs);
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
return FMath::One<ValueClass>() / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
return ValueClass(1) / FReal(diffx*diffx+diffy*diffy+diffz*diffz);
}
// evaluate interaction (blockwise)
......@@ -305,12 +305,12 @@ struct FInterpMatrixKernelRR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2);
const ValueClass one_over_r2 = ValueClass(1) / (r2);
const ValueClass one_over_r4 = one_over_r2*one_over_r2;
block[0] = one_over_r2;
const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_r4;
const ValueClass coef = ValueClass(-2.) * one_over_r4;
blockDerivative[0] = coef * diffx;
blockDerivative[1] = coef * diffy;
blockDerivative[2] = coef * diffz;
......@@ -382,7 +382,7 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffz = (zt-zs);
const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
const ValueClass r3 = r*r*r;
const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
//return one_over_r6 * one_over_r6;
//return one_over_r6;
return one_over_r6 * one_over_r6 - one_over_r6;
......@@ -409,12 +409,12 @@ struct FInterpMatrixKernelLJ : FInterpAbstractMatrixKernel<FReal>
const ValueClass r = FMath::Sqrt(diffx*diffx+diffy*diffy+diffz*diffz);
const ValueClass r2 = r*r;
const ValueClass r3 = r2*r;
const ValueClass one_over_r6 = FMath::One<ValueClass>() / (r3*r3);
const ValueClass one_over_r6 = ValueClass(1) / (r3*r3);
const ValueClass one_over_r8 = one_over_r6 / (r2);
block[0] = one_over_r6 * one_over_r6 - one_over_r6;
const FReal coef = FMath::ConvertTo<ValueClass,FReal>(12.0)*one_over_r6*one_over_r8 - FMath::ConvertTo<ValueClass,FReal>(6.0)*one_over_r8;
const FReal coef = ValueClass(12.0)*one_over_r6*one_over_r8 - ValueClass(6.0)*one_over_r8;
blockDerivative[0]= coef * diffx;
blockDerivative[1]= coef * diffy;
blockDerivative[2]= coef * diffz;
......@@ -493,7 +493,7 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
return FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
return ValueClass(1) / (r2 + ValueClass(CoreWidth));
}
// evaluate interaction (blockwise)
......@@ -515,13 +515,13 @@ struct FInterpMatrixKernelAPLUSRR : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2 = (diffx*diffx+diffy*diffy+diffz*diffz);
const ValueClass one_over_a_plus_r2 = FMath::One<ValueClass>() / (r2 + FMath::ConvertTo<ValueClass,FReal>(CoreWidth));
const ValueClass one_over_a_plus_r2 = ValueClass(1) / (r2 + ValueClass(CoreWidth));
const ValueClass one_over_a_plus_r2_squared = one_over_a_plus_r2*one_over_a_plus_r2;
block[0] = one_over_a_plus_r2;
// TODO Fix derivative
const ValueClass coef = FMath::ConvertTo<ValueClass,FReal>(-2.) * one_over_a_plus_r2_squared;
const ValueClass coef = ValueClass(-2.) * one_over_a_plus_r2_squared;
blockDerivative[0] = coef * diffx;
blockDerivative[1] = coef * diffy;
blockDerivative[2] = coef * diffz;
......
......@@ -120,13 +120,13 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
{
const ValueClass diff[3] = {(x1-x2),(y1-y2),(z1-z2)};
ValueClass dist2 = FMath::Zero<ValueClass>();
ValueClass dist2 = ValueClass(0.);
for(int d=0; d<3; ++d){
const ValueClass distX = diff[d] / FMath::ConvertTo<ValueClass,FReal>(lengthScale_);
const ValueClass distX = diff[d] / ValueClass(lengthScale_);
dist2 += distX*distX;
}
return FMath::Exp(FMath::ConvertTo<ValueClass,FReal>(-0.5)*dist2);
return FMath::Exp(ValueClass(-0.5)*dist2);
}
......@@ -145,7 +145,7 @@ struct FInterpMatrixKernelGauss : FAbstractCorrelationKernel<FReal>
ValueClass block[1], ValueClass blockDerivative[3]) const
{
block[0]=this->evaluate(x1,y1,z1,x2,y2,z2);
const ValueClass lengthScaleOpt = FMath::ConvertTo<ValueClass,FReal>(-1/(lengthScale_*lengthScale_));
const ValueClass lengthScaleOpt = ValueClass(-1/(lengthScale_*lengthScale_));
blockDerivative[0] = block[0]*(x1-x2) * lengthScaleOpt;
blockDerivative[1] = block[0]*(y1-y2) * lengthScaleOpt;
blockDerivative[2] = block[0]*(z1-z2) * lengthScaleOpt;
......
......@@ -161,7 +161,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
ValueClass ri,rj;
......@@ -192,7 +192,7 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2 = diffx*diffx+diffy*diffy+diffz*diffz;
const ValueClass one_over_r = FMath::One<ValueClass>()/FMath::Sqrt(r2 + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
const ValueClass one_over_r = ValueClass(1)/FMath::Sqrt(r2 + ValueClass(_CoreWidth2));
const ValueClass one_over_r3 = one_over_r*one_over_r*one_over_r;
const ValueClass r[3] = {diffx,diffy,diffz};
......@@ -219,14 +219,14 @@ struct FInterpMatrixKernel_R_IJ : FInterpAbstractMatrixKernel<FReal>
const ValueClass diffy = (yt-ys);
const ValueClass diffz = (zt-zs);
const ValueClass r2[3] = {diffx*diffx,diffy*diffy,diffz*diffz};
const ValueClass one_over_r2 = FMath::One<ValueClass>() / (r2[0] + r2[1] + r2[2] + FMath::ConvertTo<ValueClass,FReal>(_CoreWidth2));
const ValueClass one_over_r2 = ValueClass(1) / (r2[0] + r2[1] + r2[2] + ValueClass(_CoreWidth2));
const ValueClass one_over_r = FMath::Sqrt(one_over_r2);
const ValueClass one_over_r3 = one_over_r2*one_over_r;
const ValueClass r[3] = {diffx,diffy,diffz};
const ValueClass Three = FMath::ConvertTo<ValueClass,FReal>(3.);
const ValueClass MinusOne = - FMath::One<ValueClass>();
const ValueClass Three = ValueClass(3.);
const ValueClass MinusOne = - ValueClass(1);
for(unsigned int d=0;d<NCMP;++d){
unsigned int i = indexTab[d];
......
This diff is collapsed.
This diff is collapsed.
......@@ -28,7 +28,6 @@
#cmakedefine SCALFMM_BLAS_ADD_
#cmakedefine SCALFMM_BLAS_UPCASE
#cmakedefine SCALFMM_BLAS_NOCHANGE
////////////////////////////////////////////////////////
// FFT
///////////////////////////////////////////////////////
......@@ -68,20 +67,6 @@
#cmakedefine SCALFMM_USE_STARPU
#cmakedefine SCALFMM_DISABLE_NATIVE_OMP4
///////////////////////////////////////////////////////
// SSE
///////////////////////////////////////////////////////
#cmakedefine SCALFMM_USE_SSE
#cmakedefine __AVXPE_INTEL_COMPILER
///////////////////////////////////////////////////////
// AVX
///////////////////////////////////////////////////////
#cmakedefine SCALFMM_USE_AVX
#cmakedefine __SSEPE_INTEL_COMPILER
///////////////////////////////////////////////////////
// EZTRACE
///////////////////////////////////////////////////////
......
// See LICENCE file at project root
#ifndef FAVX_HPP
#define FAVX_HPP
#include "FGlobal.hpp"
#ifndef SCALFMM_USE_AVX
#error The AVX header is included while SCALFMM_USE_AVX is turned OFF
#else
#include <immintrin.h>
#ifdef __AVXPE_INTEL_COMPILER
// Compound-assignment operators for __m256d (DOUBLE): each one stores the
// result of the matching _mm256_*_pd intrinsic in the left operand and
// returns a reference to that operand.
inline __m256d& operator+=(__m256d& lhs, const __m256d& rhs){
    lhs = _mm256_add_pd(lhs, rhs);
    return lhs;
}
inline __m256d& operator-=(__m256d& lhs, const __m256d& rhs){
    lhs = _mm256_sub_pd(lhs, rhs);
    return lhs;
}
inline __m256d& operator*=(__m256d& lhs, const __m256d& rhs){
    lhs = _mm256_mul_pd(lhs, rhs);
    return lhs;
}
inline __m256d& operator/=(__m256d& lhs, const __m256d& rhs){
    lhs = _mm256_div_pd(lhs, rhs);
    return lhs;
}
// Binary arithmetic operators for __m256d (DOUBLE): neither operand is
// modified; the result of the matching _mm256_*_pd intrinsic is returned.
inline __m256d operator+(const __m256d& lhs, const __m256d& rhs){
    const __m256d sum = _mm256_add_pd(lhs, rhs);
    return sum;
}
inline __m256d operator-(const __m256d& lhs, const __m256d& rhs){
    const __m256d difference = _mm256_sub_pd(lhs, rhs);
    return difference;
}
inline __m256d operator*(const __m256d& lhs, const __m256d& rhs){
    const __m256d product = _mm256_mul_pd(lhs, rhs);
    return product;
}
inline __m256d operator/(const __m256d& lhs, const __m256d& rhs){
    const __m256d quotient = _mm256_div_pd(lhs, rhs);
    return quotient;
}
// Compound-assignment operators for __m256 (SINGLE): each one stores the
// result of the matching _mm256_*_ps intrinsic in the left operand and
// returns a reference to that operand.
inline __m256& operator+=(__m256& lhs, const __m256& rhs){
    lhs = _mm256_add_ps(lhs, rhs);
    return lhs;
}
inline __m256& operator-=(__m256& lhs, const __m256& rhs){
    lhs = _mm256_sub_ps(lhs, rhs);
    return lhs;
}
inline __m256& operator*=(__m256& lhs, const __m256& rhs){
    lhs = _mm256_mul_ps(lhs, rhs);
    return lhs;
}
inline __m256& operator/=(__m256& lhs, const __m256& rhs){
    lhs = _mm256_div_ps(lhs, rhs);
    return lhs;
}
// Binary arithmetic operators for __m256 (SINGLE): neither operand is
// modified; the result of the matching _mm256_*_ps intrinsic is returned.
inline __m256 operator+(const __m256& lhs, const __m256& rhs){
    const __m256 sum = _mm256_add_ps(lhs, rhs);
    return sum;
}
inline __m256 operator-(const __m256& lhs, const __m256& rhs){
    const __m256 difference = _mm256_sub_ps(lhs, rhs);
    return difference;
}
inline __m256 operator*(const __m256& lhs, const __m256& rhs){
    const __m256 product = _mm256_mul_ps(lhs, rhs);
    return product;
}
inline __m256 operator/(const __m256& lhs, const __m256& rhs){
    const __m256 quotient = _mm256_div_ps(lhs, rhs);
    return quotient;
}
#endif
#endif
#endif
// See LICENCE file at project root
#ifndef FAVX2_HPP
#define FAVX2_HPP
#include "FGlobal.hpp"
#ifndef SCALFMM_USE_AVX2
#error The AVX header is included while SCALFMM_USE_AVX is turned OFF
#endif
#include <immintrin.h>
#ifdef __MIC__
// Compound-assignment operators for __m512d (DOUBLE): each one stores the
// result of the matching _mm512_*_pd intrinsic in the left operand and
// returns a reference to that operand.
inline __m512d& operator+=(__m512d& lhs, const __m512d& rhs){
    lhs = _mm512_add_pd(lhs, rhs);
    return lhs;
}
inline __m512d& operator-=(__m512d& lhs, const __m512d& rhs){
    lhs = _mm512_sub_pd(lhs, rhs);
    return lhs;
}
inline __m512d& operator*=(__m512d& lhs, const __m512d& rhs){
    lhs = _mm512_mul_pd(lhs, rhs);
    return lhs;
}
inline __m512d& operator/=(__m512d& lhs, const __m512d& rhs){
    lhs = _mm512_div_pd(lhs, rhs);
    return lhs;
}
// Binary arithmetic operators for __m512d (DOUBLE): neither operand is
// modified; the result of the matching _mm512_*_pd intrinsic is returned.
inline __m512d operator+(const __m512d& lhs, const __m512d& rhs){
    const __m512d sum = _mm512_add_pd(lhs, rhs);
    return sum;
}
inline __m512d operator-(const __m512d& lhs, const __m512d& rhs){
    const __m512d difference = _mm512_sub_pd(lhs, rhs);
    return difference;
}
inline __m512d operator*(const __m512d& lhs, const __m512d& rhs){
    const __m512d product = _mm512_mul_pd(lhs, rhs);
    return product;
}
inline __m512d operator/(const __m512d& lhs, const __m512d& rhs){
    const __m512d quotient = _mm512_div_pd(lhs, rhs);
    return quotient;
}
// Compound-assignment operators for __m512 (SINGLE): each one stores the
// result of the matching _mm512_*_ps intrinsic in the left operand and
// returns a reference to that operand.
inline __m512& operator+=(__m512& lhs, const __m512& rhs){
    lhs = _mm512_add_ps(lhs, rhs);
    return lhs;
}
inline __m512& operator-=(__m512& lhs, const __m512& rhs){
    lhs = _mm512_sub_ps(lhs, rhs);
    return lhs;
}
inline __m512& operator*=(__m512& lhs, const __m512& rhs){
    lhs = _mm512_mul_ps(lhs, rhs);
    return lhs;
}
inline __m512& operator/=(__m512& lhs, const __m512& rhs){
    lhs = _mm512_div_ps(lhs, rhs);
    return lhs;
}
// Binary arithmetic operators for __m512 (SINGLE): neither operand is
// modified; the result of the matching _mm512_*_ps intrinsic is returned.
inline __m512 operator+(const __m512& lhs, const __m512& rhs){
    const __m512 sum = _mm512_add_ps(lhs, rhs);
    return sum;
}
inline __m512 operator-(const __m512& lhs, const __m512& rhs){
    const __m512 difference = _mm512_sub_ps(lhs, rhs);
    return difference;
}
inline __m512 operator*(const __m512& lhs, const __m512& rhs){
    const __m512 product = _mm512_mul_ps(lhs, rhs);
    return product;
}
inline __m512 operator/(const __m512& lhs, const __m512& rhs){
    const __m512 quotient = _mm512_div_ps(lhs, rhs);
    return quotient;
}
#endif
#endif
This diff is collapsed.