Commit 1a642434 authored by BRAMAS Berenger's avatar BRAMAS Berenger

add sse for simple precision should be tested

parent 7e9eb4a6
......@@ -102,6 +102,8 @@ public:
}
}
#else
#ifdef ScalFMM_USE_DOUBLE_PRECISION
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
......@@ -273,6 +275,182 @@ public:
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
#else
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m128 mOne = _mm_set1_ps(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+3)/4;
const __m128*const sourcesPhysicalValues = (const __m128*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m128*const sourcesX = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m128*const sourcesY = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m128*const sourcesZ = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[2];
__m128*const sourcesForcesX = (__m128*)inNeighbors[idxNeighbors]->getForcesX();
__m128*const sourcesForcesY = (__m128*)inNeighbors[idxNeighbors]->getForcesY();
__m128*const sourcesForcesZ = (__m128*)inNeighbors[idxNeighbors]->getForcesZ();
__m128*const sourcesPotentials = (__m128*)inNeighbors[idxNeighbors]->getPotentials();
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m128 tx = _mm_load1_ps(&targetsX[idxTarget]);
const __m128 ty = _mm_load1_ps(&targetsY[idxTarget]);
const __m128 tz = _mm_load1_ps(&targetsZ[idxTarget]);
const __m128 tv = _mm_load1_ps(&targetsPhysicalValues[idxTarget]);
__m128 tfx = _mm_setzero_ps();
__m128 tfy = _mm_setzero_ps();
__m128 tfz = _mm_setzero_ps();
__m128 tpo = _mm_setzero_ps();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m128 dx = sourcesX[idxSource] - tx;
__m128 dy = sourcesY[idxSource] - ty;
__m128 dz = sourcesZ[idxSource] - tz;
__m128 inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m128 inv_distance = _mm_sqrt_ps(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(16))) float buffer[4];
_mm_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
}
{
const int nbParticlesSources = (nbParticlesTargets+3)/4;
const __m128*const sourcesPhysicalValues = (const __m128*)targetsPhysicalValues;
const __m128*const sourcesX = (const __m128*)targetsX;
const __m128*const sourcesY = (const __m128*)targetsY;
const __m128*const sourcesZ = (const __m128*)targetsZ;
__m128*const sourcesForcesX = (__m128*)targetsForcesX;
__m128*const sourcesForcesY = (__m128*)targetsForcesY;
__m128*const sourcesForcesZ = (__m128*)targetsForcesZ;
__m128*const sourcesPotentials = (__m128*)targetsPotentials;
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m128 tx = _mm_load1_ps(&targetsX[idxTarget]);
const __m128 ty = _mm_load1_ps(&targetsY[idxTarget]);
const __m128 tz = _mm_load1_ps(&targetsZ[idxTarget]);
const __m128 tv = _mm_load1_ps(&targetsPhysicalValues[idxTarget]);
__m128 tfx = _mm_setzero_ps();
__m128 tfy = _mm_setzero_ps();
__m128 tfz = _mm_setzero_ps();
__m128 tpo = _mm_setzero_ps();
for(int idxSource = (idxTarget+2)/2 ; idxSource < nbParticlesSources ; ++idxSource){
__m128 dx = sourcesX[idxSource] - tx;
__m128 dy = sourcesY[idxSource] - ty;
__m128 dz = sourcesZ[idxSource] - tz;
__m128 inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m128 inv_distance = _mm_sqrt_ps(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(16))) float buffer[4];
_mm_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; idxTarget += 4){
for(int idxClose = 1 ; idxClose < 4; ++idxClose){
const int idxSource = idxTarget + idxClose;
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
FReal dy = targetsY[idxSource] - targetsY[idxTarget];
FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * targetsPhysicalValues[idxSource];
targetsForcesX[idxSource] -= dx;
targetsForcesY[idxSource] -= dy;
targetsForcesZ[idxSource] -= dz;
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
}
#endif
#endif
/**
......@@ -327,6 +505,8 @@ public:
}
}
#else
#ifdef ScalFMM_USE_DOUBLE_PRECISION
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
......@@ -398,6 +578,80 @@ public:
}
}
}
#else
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m128 mOne = _mm_set1_ps(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+3)/4;
const __m128*const sourcesPhysicalValues = (const __m128*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m128*const sourcesX = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m128*const sourcesY = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m128*const sourcesZ = (const __m128*)inNeighbors[idxNeighbors]->getPositions()[2];
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m128 tx = _mm_load1_ps(&targetsX[idxTarget]);
const __m128 ty = _mm_load1_ps(&targetsY[idxTarget]);
const __m128 tz = _mm_load1_ps(&targetsZ[idxTarget]);
const __m128 tv = _mm_load1_ps(&targetsPhysicalValues[idxTarget]);
__m128 tfx = _mm_setzero_ps();
__m128 tfy = _mm_setzero_ps();
__m128 tfz = _mm_setzero_ps();
__m128 tpo = _mm_setzero_ps();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m128 dx = sourcesX[idxSource] - tx;
__m128 dy = sourcesY[idxSource] - ty;
__m128 dz = sourcesZ[idxSource] - tz;
__m128 inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m128 inv_distance = _mm_sqrt_ps(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
}
__attribute__((aligned(16))) float buffer[4];
_mm_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
}
}
#endif
#endif
/** P2P mutual interaction,
......
......@@ -41,6 +41,38 @@ inline __m128d operator/(const __m128d& v1, const __m128d& v2){
return _mm_div_pd(v1, v2);
}
inline __m128& operator+=(__m128& v1, const __m128& v2){
return (v1 = _mm_add_ps(v1, v2));
}
inline __m128& operator-=(__m128& v1, const __m128& v2){
return (v1 = _mm_sub_ps(v1, v2));
}
inline __m128& operator*=(__m128& v1, const __m128& v2){
return (v1 = _mm_mul_ps(v1, v2));
}
inline __m128& operator/=(__m128& v1, const __m128& v2){
return (v1 = _mm_div_ps(v1, v2));
}
inline __m128 operator+(const __m128& v1, const __m128& v2){
return _mm_add_ps(v1, v2);
}
inline __m128 operator-(const __m128& v1, const __m128& v2){
return _mm_sub_ps(v1, v2);
}
inline __m128 operator*(const __m128& v1, const __m128& v2){
return _mm_mul_ps(v1, v2);
}
inline __m128 operator/(const __m128& v1, const __m128& v2){
return _mm_div_ps(v1, v2);
}
#endif
#endif // FSSE_HPP
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment