Attention une mise à jour du serveur va être effectuée le vendredi 16 avril entre 12h et 12h30. Cette mise à jour va générer une interruption du service de quelques minutes.

Commit 72b93562 authored by PIACIBELLO Cyrille's avatar PIACIBELLO Cyrille

Directs Kernels are working, in noDist, there is a test for the FullMutual

parent f0b98834
......@@ -159,7 +159,7 @@ namespace FP2P{
//P2P with the last ones
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; /*idxTarget += 4*/idxTarget++){
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; idxTarget++){
for(int idxS = 1 ; idxS < 4-(idxTarget%4) ; ++idxS){
int idxSource = idxTarget + idxS;
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
......@@ -263,185 +263,185 @@ namespace FP2P{
}
#else //Float, ScalFMM_USE_DOUBLE_PRECISION not set
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m256 mOne = _mm256_set1_ps(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+7)/8;
const __m256*const sourcesPhysicalValues = (const __m256*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m256*const sourcesX = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m256*const sourcesY = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m256*const sourcesZ = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[2];
__m256*const sourcesForcesX = (__m256*)inNeighbors[idxNeighbors]->getForcesX();
__m256*const sourcesForcesY = (__m256*)inNeighbors[idxNeighbors]->getForcesY();
__m256*const sourcesForcesZ = (__m256*)inNeighbors[idxNeighbors]->getForcesZ();
__m256*const sourcesPotentials = (__m256*)inNeighbors[idxNeighbors]->getPotentials();
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256 tx = _mm256_broadcast_ss(&targetsX[idxTarget]);
const __m256 ty = _mm256_broadcast_ss(&targetsY[idxTarget]);
const __m256 tz = _mm256_broadcast_ss(&targetsZ[idxTarget]);
const __m256 tv = _mm256_broadcast_ss(&targetsPhysicalValues[idxTarget]);
__m256 tfx = _mm256_setzero_ps();
__m256 tfy = _mm256_setzero_ps();
__m256 tfz = _mm256_setzero_ps();
__m256 tpo = _mm256_setzero_ps();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m256 dx = sourcesX[idxSource] - tx;
__m256 dy = sourcesY[idxSource] - ty;
__m256 dz = sourcesZ[idxSource] - tz;
__m256 inv_square_distance = mOne/(dx*dx + dy*dy + dz*dz);
const __m256 inv_distance = _mm256_rsqrt_ps(dx*dx + dy*dy + dz*dz);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(32))) float buffer[8];
_mm256_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
template <class ContainerClass>
static void FullMutual(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
const FReal*const targetsZ = inTargets->getPositions()[2];
FReal*const targetsForcesX = inTargets->getForcesX();
FReal*const targetsForcesY = inTargets->getForcesY();
FReal*const targetsForcesZ = inTargets->getForcesZ();
FReal*const targetsPotentials = inTargets->getPotentials();
const __m256 mOne = _mm256_set1_ps(1.0);
for(int idxNeighbors = 0 ; idxNeighbors < limiteNeighbors ; ++idxNeighbors){
if( inNeighbors[idxNeighbors] ){
const int nbParticlesSources = (inNeighbors[idxNeighbors]->getNbParticles()+7)/8;
const __m256*const sourcesPhysicalValues = (const __m256*)inNeighbors[idxNeighbors]->getPhysicalValues();
const __m256*const sourcesX = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[0];
const __m256*const sourcesY = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[1];
const __m256*const sourcesZ = (const __m256*)inNeighbors[idxNeighbors]->getPositions()[2];
__m256*const sourcesForcesX = (__m256*)inNeighbors[idxNeighbors]->getForcesX();
__m256*const sourcesForcesY = (__m256*)inNeighbors[idxNeighbors]->getForcesY();
__m256*const sourcesForcesZ = (__m256*)inNeighbors[idxNeighbors]->getForcesZ();
__m256*const sourcesPotentials = (__m256*)inNeighbors[idxNeighbors]->getPotentials();
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256 tx = _mm256_broadcast_ss(&targetsX[idxTarget]);
const __m256 ty = _mm256_broadcast_ss(&targetsY[idxTarget]);
const __m256 tz = _mm256_broadcast_ss(&targetsZ[idxTarget]);
const __m256 tv = _mm256_broadcast_ss(&targetsPhysicalValues[idxTarget]);
__m256 tfx = _mm256_setzero_ps();
__m256 tfy = _mm256_setzero_ps();
__m256 tfz = _mm256_setzero_ps();
__m256 tpo = _mm256_setzero_ps();
for(int idxSource = 0 ; idxSource < nbParticlesSources ; ++idxSource){
__m256 dx = sourcesX[idxSource] - tx;
__m256 dy = sourcesY[idxSource] - ty;
__m256 dz = sourcesZ[idxSource] - tz;
__m256 inv_square_distance = mOne/(dx*dx + dy*dy + dz*dz);
const __m256 inv_distance = _mm256_rsqrt_ps(dx*dx + dy*dy + dz*dz);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(32))) float buffer[8];
_mm256_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
}
}
}
}
}
{
const int nbParticlesSources = (nbParticlesTargets+7)/8;
const __m256*const sourcesPhysicalValues = (const __m256*)targetsPhysicalValues;
const __m256*const sourcesX = (const __m256*)targetsX;
const __m256*const sourcesY = (const __m256*)targetsY;
const __m256*const sourcesZ = (const __m256*)targetsZ;
__m256*const sourcesForcesX = (__m256*)targetsForcesX;
__m256*const sourcesForcesY = (__m256*)targetsForcesY;
__m256*const sourcesForcesZ = (__m256*)targetsForcesZ;
__m256*const sourcesPotentials = (__m256*)targetsPotentials;
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256 tx = _mm256_broadcast_ss(&targetsX[idxTarget]);
const __m256 ty = _mm256_broadcast_ss(&targetsY[idxTarget]);
const __m256 tz = _mm256_broadcast_ss(&targetsZ[idxTarget]);
const __m256 tv = _mm256_broadcast_ss(&targetsPhysicalValues[idxTarget]);
__m256 tfx = _mm256_setzero_ps();
__m256 tfy = _mm256_setzero_ps();
__m256 tfz = _mm256_setzero_ps();
__m256 tpo = _mm256_setzero_ps();
for(int idxSource = (idxTarget+2)/2 ; idxSource < nbParticlesSources ; ++idxSource){
__m256 dx = sourcesX[idxSource] - tx;
__m256 dy = sourcesY[idxSource] - ty;
__m256 dz = sourcesZ[idxSource] - tz;
__m256 inv_square_distance = mOne/(dx*dx + dy*dy + dz*dz);
const __m256 inv_distance = _mm256_rsqrt_ps(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
{
const int nbParticlesSources = (nbParticlesTargets+7)/8;
const __m256*const sourcesPhysicalValues = (const __m256*)targetsPhysicalValues;
const __m256*const sourcesX = (const __m256*)targetsX;
const __m256*const sourcesY = (const __m256*)targetsY;
const __m256*const sourcesZ = (const __m256*)targetsZ;
__m256*const sourcesForcesX = (__m256*)targetsForcesX;
__m256*const sourcesForcesY = (__m256*)targetsForcesY;
__m256*const sourcesForcesZ = (__m256*)targetsForcesZ;
__m256*const sourcesPotentials = (__m256*)targetsPotentials;
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
const __m256 tx = _mm256_broadcast_ss(&targetsX[idxTarget]);
const __m256 ty = _mm256_broadcast_ss(&targetsY[idxTarget]);
const __m256 tz = _mm256_broadcast_ss(&targetsZ[idxTarget]);
const __m256 tv = _mm256_broadcast_ss(&targetsPhysicalValues[idxTarget]);
__m256 tfx = _mm256_setzero_ps();
__m256 tfy = _mm256_setzero_ps();
__m256 tfz = _mm256_setzero_ps();
__m256 tpo = _mm256_setzero_ps();
for(int idxSource = (idxTarget+8)/8 ; idxSource < nbParticlesSources ; ++idxSource){
__m256 dx = sourcesX[idxSource] - tx;
__m256 dy = sourcesY[idxSource] - ty;
__m256 dz = sourcesZ[idxSource] - tz;
__m256 inv_square_distance = mOne/(dx*dx + dy*dy + dz*dz);
const __m256 inv_distance = _mm256_rsqrt_ps(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= tv * sourcesPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
tfx += dx;
tfy += dy;
tfz += dz;
tpo += inv_distance * sourcesPhysicalValues[idxSource];
sourcesForcesX[idxSource] -= dx;
sourcesForcesY[idxSource] -= dy;
sourcesForcesZ[idxSource] -= dz;
sourcesPotentials[idxSource] += inv_distance * tv;
}
__attribute__((aligned(32))) float buffer[8];
_mm256_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
}
}
__attribute__((aligned(32))) float buffer[8];
_mm256_store_ps(buffer, tfx);
targetsForcesX[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfy);
targetsForcesY[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tfz);
targetsForcesZ[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
_mm256_store_ps(buffer, tpo);
targetsPotentials[idxTarget] += buffer[0] + buffer[1] + buffer[2] + buffer[3] + buffer[4] + buffer[5] + buffer[6] + buffer[7];
}
}
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; idxTarget += 4){
for(int idxClose = 1 ; idxClose < 4; ++idxClose){
const int idxSource = idxTarget + idxClose;
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
FReal dy = targetsY[idxSource] - targetsY[idxTarget];
FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * targetsPhysicalValues[idxSource];
for(int idxTarget = 0 ; idxTarget < nbParticlesTargets ; ++idxTarget){
for(int idxS = 1 ; idxS < 8-(idxTarget%8) ; ++idxS){
const int idxSource = idxTarget + idxS;
FReal dx = targetsX[idxSource] - targetsX[idxTarget];
FReal dy = targetsY[idxSource] - targetsY[idxTarget];
FReal dz = targetsZ[idxSource] - targetsZ[idxTarget];
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
const FReal inv_distance = FMath::Sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetsPhysicalValues[idxTarget] * targetsPhysicalValues[idxSource];
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
targetsForcesX[idxTarget] += dx;
targetsForcesY[idxTarget] += dy;
targetsForcesZ[idxTarget] += dz;
targetsPotentials[idxTarget] += inv_distance * targetsPhysicalValues[idxSource];
targetsForcesX[idxSource] -= dx;
targetsForcesY[idxSource] -= dy;
targetsForcesZ[idxSource] -= dz;
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
targetsForcesX[idxSource] -= dx;
targetsForcesY[idxSource] -= dy;
targetsForcesZ[idxSource] -= dz;
targetsPotentials[idxSource] += inv_distance * targetsPhysicalValues[idxTarget];
}
}
}
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
template <class ContainerClass>
static void FullRemote(ContainerClass* const FRestrict inTargets, ContainerClass* const inNeighbors[],
const int limiteNeighbors){
const int nbParticlesTargets = inTargets->getNbParticles();
const FReal*const targetsPhysicalValues = inTargets->getPhysicalValues();
const FReal*const targetsX = inTargets->getPositions()[0];
const FReal*const targetsY = inTargets->getPositions()[1];
......
This diff is collapsed.
......@@ -55,7 +55,7 @@
int main(int argc, char ** argv){
//
///////////////////////What we do/////////////////////////////
if( FParameters::existParameter(argc, argv, "-help" ) || argc < 4){
if( FParameters::existParameter(argc, argv, "-help" )){
std::cout << ">> This executable has to be used to compute interaction either for periodic or non periodic system.\n";
std::cout << ">> Example -fin filenameIN.{fma or bfma) -fout filenameOUT{fma or bfma) \n";
std::cout << ">> Default input file : ../Data/unitCubeXYZQ20k.fma\n";
......@@ -103,11 +103,6 @@ int main(int argc, char ** argv){
for(int idx = 0 ; idx<nbParticles ; ++idx){
//
loader.fillParticle(particles[idx].getPtrFirstData(), particles[idx].getReadDataNumber());
// loader.fillParticle(particles[idx].getPtrFirstData(), nbDataToRead); // OK
// loader.fillParticle(particles[idx]); // OK
// std::cout << idx <<" "<< particles[idx].getPosition() << " "<<particles[idx].getPhysicalValue() << " "<<particles[idx].getPotential()
// <<" " << particles[idx].getForces()[0]<<" " <<particles[idx].getForces()[1]<<" " <<particles[idx].getForces()[2]<<" " <<std::endl;
//
totalCharge += particles[idx].getPhysicalValue() ;
}
......@@ -127,12 +122,12 @@ int main(int argc, char ** argv){
OctreeClass tree(2, 1, loader.getBoxWidth(), loader.getCenterOfBox());
for(int idxP=0 ; idxP<nbParticles ; ++idxP){
tree.insert(particles[idxP].getPosition());
tree.insert(particles[idxP].getPosition(),particles[idxP].getPhysicalValue());
}
int i=0;
tree.forEachLeaf([&](LeafClass * leaf){
printf("leaf : %d\n",i++ );
});
int k=0;
// tree.forEachLeaf([&](LeafClass * leaf){
// printf("leaf : %d\n",k++ );
// });
//
// ----------------------------------------------------------------------------------------------------------
// COMPUTATION
......@@ -141,6 +136,25 @@ int main(int argc, char ** argv){
//
// computation
//
#ifdef ScalFMM_USE_DOUBLE_PRECISION
printf("Double precision \n");
#else
printf("Simple precision \n");
#endif
#ifdef ScalFMM_USE_AVX
printf("AVX incomming .......\n\n");
#endif
#ifdef ScalFMM_USE_SSE
printf("SSE incomming .......\n\n");
#endif
#ifndef ScalFMM_USE_SSE
#ifndef ScalFMM_USE_AVX
printf("Classic incomming ...\n\n");
#endif
#endif
counter.tic();
{
typename OctreeClass::Iterator iterator(&tree);
iterator.gotoBottomLeft();
......@@ -153,6 +167,19 @@ int main(int argc, char ** argv){
}while(iterator.moveRight());
}
counter.tac();
std::cout << "Done " << "(@ Computation " << counter.elapsed() << " s)." << std::endl;
FReal cumulPot = 0.0;
k=0;
tree.forEachLeaf([&](LeafClass * leaf){
int maxParts = leaf->getSrc()->getNbParticles();
FReal* datas = leaf->getSrc()->getPotentials();
for(int i=0 ; i<maxParts ; ++i){
cumulPot += datas[i];
}
printf("leaf : %d --> cumul pot %e : \n",k++, cumulPot);
});
delete[] particles;
return 0;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment