Commit 3610005a authored by PIACIBELLO Cyrille's avatar PIACIBELLO Cyrille

ScalFMM supports for AVX, changes in P2P arch

parent ffd6d769
......@@ -53,6 +53,7 @@ OPTION( ScalFMM_USE_DOUBLE_PRECISION "Set to ON to compile in double precision"
OPTION( ScalFMM_ATTACHE_SOURCE "Set to ON to compile with -g" OFF )
OPTION( ScalFMM_USE_ADDONS "Set to ON to compile add ons" OFF )
OPTION( ScalFMM_USE_SSE "Set to ON to compile with SSE support" ON )
OPTION( ScalFMM_USE_AVX "Set to ON to compile with AVX support" OFF )
OPTION( ScalFMM_USE_ASSERT "Set to ON to enable safe tests during execution" ON )
OPTION( ScalFMM_USE_MIC_NATIVE "Set to ON to compile in native mode for MIC" OFF )
# Set scalfmm to default libraries
......@@ -195,6 +196,19 @@ if( ScalFMM_USE_SSE )
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3 -mfpmath=sse")
ENDIF()
endif()
#Use AVX
MESSAGE(STATUS "ScalFMM_USE_AVX = ${ScalFMM_USE_AVX}")
if(ScalFMM_USE_AVX)
IF(CMAKE_CXX_COMPILER_ID STREQUAL "Intel")
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -vec -axAVX")
ELSE()
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -msse -msse2 -msse3 -mfpmath=sse -mavx")
ENDIF()
endif()
if( ScalFMM_USE_MIC_NATIVE )
SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mmic")
......
This diff is collapsed.
#ifndef FP2PAVX_HPP
#define FP2PAVX_HPP
#include "../../Utils/FGlobal.hpp"
#include "../../Utils/FMath.hpp"
#include "../../Utils/FAvx.hpp"
namespace FP2P{
#ifdef ScalFMM_USE_DOUBLE_PRECISION
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m256d mOne = _mm256_set1_pd(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+3)/4;
const __m256d*const sourcesPhysicalValues = (const __m256d*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m256d*const sourcesX = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m256d*const sourcesY = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m256d*const sourcesZ = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[2];
__m256d*const sourcesForcesX = (__m256d*)inNeighbors[idxNeighbors]->getForcesX();
__m256d*const sourcesForcesY = (__m256d*)inNeighbors[idxNeighbors]->getForcesY();
__m256d*const sourcesForcesZ = (__m256d*)inNeighbors[idxNeighbors]->getForcesZ();
__m256d*const sourcesPotentials = (__m256d*)inNeighbors[idxNeighbors]->getPotentials();
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256d tx = _mm256_broadcast_sd(&targetsX[idxTarget]);
const __m256d ty = _mm256_broadcast_sd(&targetsY[idxTarget]);
const __m256d tz = _mm256_broadcast_sd(&targetsZ[idxTarget]);
const __m256d tv = _mm256_broadcast_sd(&targetsPhysicalValues[idxTarget]);
__m256d tfx = _mm256_setzero_pd();
__m256d tfy = _mm256_setzero_pd();
__m256d tfz = _mm256_setzero_pd();
__m256d tpo = _mm256_setzero_pd();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m256d dx = sourcesX[idxSource] - tx;
__m256d dy = sourcesY[idxSource] - ty;
__m256d dz = sourcesZ[idxSource] - tz;
__m256d inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m256d inv_distance = _mm256_sqrt_pd(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(32))) double buffer[4];
_mm256_store_pd(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
}
{
const int nbParticlesSources = (nbParticlesTargets+3)/4;
const __m256d*const sourcesPhysicalValues = (const __m256d*)targetsPhysicalValues;
const __m256d*const sourcesX = (const __m256d*)targetsX;
const __m256d*const sourcesY = (const __m256d*)targetsY;
const __m256d*const sourcesZ = (const __m256d*)targetsZ;
__m256d*const sourcesForcesX = (__m256d*)targetsForcesX;
__m256d*const sourcesForcesY = (__m256d*)targetsForcesY;
__m256d*const sourcesForcesZ = (__m256d*)targetsForcesZ;
__m256d*const sourcesPotentials = (__m256d*)targetsPotentials;
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256d tx = _mm256_broadcast_sd(&targetsX[idxTarget]);
const __m256d ty = _mm256_broadcast_sd(&targetsY[idxTarget]);
const __m256d tz = _mm256_broadcast_sd(&targetsZ[idxTarget]);
const __m256d tv = _mm256_broadcast_sd(&targetsPhysicalValues[idxTarget]);
__m256d tfx = _mm256_setzero_pd();
__m256d tfy = _mm256_setzero_pd();
__m256d tfz = _mm256_setzero_pd();
__m256d tpo = _mm256_setzero_pd();
for(int idxSource = (idxTarget+2)/2 ; idxSource < nbParticlesSources ; ++idxSource){
__m256d dx = sourcesX[idxSource] - tx;
__m256d dy = sourcesY[idxSource] - ty;
__m256d dz = sourcesZ[idxSource] - tz;
__m256d inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m256d inv_distance = _mm256_sqrt_pd(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(32))) double buffer[4];
_mm256_store_pd(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; idxTarget += 2){
const int idxSource = idxTarget + 1;
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
FReal dy = targetsY[idxSource] - targetsY[idxTarget];
FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * targetsPhysicalValues[idxSource];
targetsForcesX[idxSource] -= dx;
targetsForcesY[idxSource] -= dy;
targetsForcesZ[idxSource] -= dz;
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m256d mOne = _mm256_set1_pd(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+3)/4;
const __m256d*const sourcesPhysicalValues = (const __m256d*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m256d*const sourcesX = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m256d*const sourcesY = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m256d*const sourcesZ = (const __m256d*)inNeighbors[idxNeighbors]->getPositions()[2];
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256d tx = _mm_load1_pd(&targetsX[idxTarget]);
const __m256d ty = _mm_load1_pd(&targetsY[idxTarget]);
const __m256d tz = _mm_load1_pd(&targetsZ[idxTarget]);
const __m256d tv = _mm_load1_pd(&targetsPhysicalValues[idxTarget]);
__m256d tfx = _mm256_setzero_pd();
__m256d tfy = _mm256_setzero_pd();
__m256d tfz = _mm256_setzero_pd();
__m256d tpo = _mm256_setzero_pd();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m256d dx = sourcesX[idxSource] - tx;
__m256d dy = sourcesY[idxSource] - ty;
__m256d dz = sourcesZ[idxSource] - tz;
__m256d inv_square_distance = mOne / (dx*dx + dy*dy + dz*dz);
const __m256d inv_distance = _mm256_sqrt_pd(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
}
__attribute__((aligned(32))) double buffer[4];
_mm256_store_pd(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
_mm256_store_pd(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3];
}
}
}
}
#else //Float, ScalFMM_USE_DOUBLE_PRECISION not set
}
#endif //FP2PAVX_HPP
#ifndef FP2PCLASSIC_HPP
#define FP2PCLASSIC_HPP
#include "../../Utils/FGlobal.hpp"
#include "../../Utils/FMath.hpp"
namespace FP2P{
template <class ContainerClass>
inline void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues();
const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
FReal*const sourcesForcesX = inNeighbors[idxNeighbors]->getForcesX();
FReal*const sourcesForcesY = inNeighbors[idxNeighbors]->getForcesY();
FReal*const sourcesForcesZ = inNeighbors[idxNeighbors]->getForcesZ();
FReal*const sourcesPotentials = inNeighbors[idxNeighbors]->getPotentials();
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
FReal dx = sourcesX[idxSource] - targetsX[idxTarget];
FReal dy = sourcesY[idxSource] - targetsY[idxTarget];
FReal dz = sourcesZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
}
}
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
for(int idxSource = idxTarget + 1 ; idxSource < nbParticlesTargets ; ++idxSource){
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
FReal dy = targetsY[idxSource] - targetsY[idxTarget];
FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * targetsPhysicalValues[idxSource];
targetsForcesX[idxSource] -= dx;
targetsForcesY[idxSource] -= dy;
targetsForcesZ[idxSource] -= dz;
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
}
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = inNeighbors[idxNeighbors]->getNbParticles();
const FReal*const sourcesPhysicalValues = inNeighbors[idxNeighbors]->getPhysicalValues();
const FReal*const sourcesX = inNeighbors[idxNeighbors]->getPositions()[0];
const FReal*const sourcesY = inNeighbors[idxNeighbors]->getPositions()[1];
const FReal*const sourcesZ = inNeighbors[idxNeighbors]->getPositions()[2];
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
FReal dx = sourcesX[idxSource] - targetsX[idxTarget];
FReal dy = sourcesY[idxSource] - targetsY[idxTarget];
FReal dz = sourcesZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * sourcesPhysicalValues[idxSource];
}
}
}
}
}
}
#endif //FP2PCLASSIC_HPP
This diff is collapsed.
......@@ -35,6 +35,18 @@ inline void* Allocate16BAligned(const size_t inSize){
return reinterpret_cast<void*>( (reinterpret_cast<long long int>(memoryBlock) + 16 + 15) & ~0xFLL);
}
/**
* @brief Allocate32BAligned
* @param inSize in Bytes
* @return the address of the allocated block of size inSize
*/
inline void* Allocate32BAligned(const size_t inSize){
unsigned char* memoryBlock;
int resMemAl = posix_memalign((void**)&memoryBlock,32,inSize);
return memoryBlock;
}
/**
* Allocate an array of inNbElements elements of type ObjectType.
* The objects are not initialized!
......@@ -53,6 +65,15 @@ inline void Dealloc16BAligned(const void*const memoryBlock){
delete[] reinterpret_cast<const unsigned char*>(*storeRealAddress);
}
}
/**
* Delete a block allocated with Allocate32BAligned
*/
inline void Dealloc32BAligned(const void*const memoryBlock){
const void * toBeFreed = memoryBlock;
free((void *)toBeFreed);
}
}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment