Commit 35c6eef4 authored by BRAMAS Berenger's avatar BRAMAS Berenger
Browse files

make the P2P working on cuda but with 1 thread for now

parent ae7737ec
......@@ -875,21 +875,21 @@ template dim3 FCuda__GetBlockSize< FTestCudaKernels<double> >(FTestCudaKernels<d
#include "../P2P/FCudaP2P.hpp"
template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
unsigned char* containersPtr, std::size_t containersSize,
FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
int idxLevel, const OutOfBlockInteraction* outsideInteractions,
......@@ -897,14 +897,14 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
......@@ -914,14 +914,14 @@ int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
const OutOfBlockInteraction* outsideInteractions,
......@@ -929,13 +929,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
const int treeHeight, FCudaP2P<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
const OutOfBlockInteraction* outsideInteractions,
......@@ -943,7 +943,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
FCudaP2P<float>* kernel, cudaStream_t currentStream,
......@@ -959,21 +959,21 @@ template dim3 FCuda__GetBlockSize< FCudaP2P<float> >(FCudaP2P<float>* cukernel);
template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
unsigned char* containersPtr, std::size_t containersSize,
FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
int idxLevel, const OutOfBlockInteraction* outsideInteractions,
......@@ -981,14 +981,14 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
......@@ -998,14 +998,14 @@ int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
const OutOfBlockInteraction* outsideInteractions,
......@@ -1013,13 +1013,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
const int treeHeight, FCudaP2P<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
const OutOfBlockInteraction* outsideInteractions,
......@@ -1027,7 +1027,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
FCudaP2P<double>* kernel, cudaStream_t currentStream,
......
......@@ -37,7 +37,7 @@ public:
positionsPointers[2] = reinterpret_cast<FReal*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*2);
for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){
attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*(idxAttribute+3));
attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*3 + inLeadingAttributes*idxAttribute);
}
// Redirect pointers to data
......
......@@ -6,29 +6,6 @@
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#include "../Cuda/FCudaCompositeCell.hpp"
#define DirectMacro(targetX, targetY, targetZ, targetPhys, \
forceX, forceY, forceZ, potential,\
sourcesX, sourcesY, sourcesZ, sourcesPhys)\
{\
FReal dx = sourcesX - targetX;\
FReal dy = sourcesY - targetY;\
FReal dz = sourcesZ - targetZ;\
\
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);\
FReal inv_distance = sqrt(inv_square_distance);\
\
inv_square_distance *= inv_distance;\
inv_square_distance *= targetPhys * sourcesPhys;\
\
dx *= inv_square_distance;\
dy *= inv_square_distance;\
dz *= inv_square_distance;\
\
forceX += dx;\
forceY += dy;\
forceZ += dz;\
sourcesPhys += inv_distance * sourcesPhys;\
}
#define Min(x,y) ((x)<(y)?(x):(y))
#define Max(x,y) ((x)>(y)?(x):(y))
......@@ -40,6 +17,30 @@ template <class FReal>
class FCudaP2P {
protected:
public:
__device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const FReal& targetPhys,
FReal& forceX, FReal& forceY,FReal& forceZ, FReal& potential,
const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const {
FReal dx = sourcesX - targetX;
FReal dy = sourcesY - targetY;
FReal dz = sourcesZ - targetZ;
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
FReal inv_distance = sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetPhys * sourcesPhys;
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
forceX += dx;
forceY += dy;
forceZ += dz;
potential += inv_distance * sourcesPhys;
}
static double DSqrt(const double val){
return sqrt(val);
}
......@@ -48,10 +49,10 @@ public:
return sqrtf(val);
}
typedef FCudaGroupAttachedLeaf<FReal,4,4,FReal> ContainerClass;
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass;
static const int SHARE_SIZE = 128;
static const int SHARE_SIZE = 1;//128;
__device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
}
......@@ -60,8 +61,8 @@ public:
}
__device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/,
const int* /*neighPositions*/,
const int /*size*/, const int /*level*/) {
const int* /*neighPositions*/,
const int /*size*/, const int /*level*/) {
}
__device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) {
......@@ -86,12 +87,10 @@ public:
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
if(threadCompute){
targetX = targets->getPositions()[0][idxPart];
targetY = targets->getPositions()[1][idxPart];
targetZ = targets->getPositions()[2][idxPart];
targetPhys = targets->getAttribute(0)[idxPart];
}
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
......@@ -101,57 +100,60 @@ public:
const int nbCopies = Min(SHARE_SIZE, targets->getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = targets->getPositions()[0][idxPart];
sourcesY[threadIdx.x] = targets->getPositions()[1][idxPart];
sourcesZ[threadIdx.x] = targets->getPositions()[2][idxPart];
sourcesPhys[threadIdx.x] = targets->getAttribute(0)[idxPart];
sourcesX[threadIdx.x] = targets->getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = targets->getPositions()[1][threadIdx.x+idxCopy];
sourcesZ[threadIdx.x] = targets->getPositions()[2][threadIdx.x+idxCopy];
sourcesPhys[threadIdx.x] = targets->getAttribute(0)[threadIdx.x+idxCopy];
}
__syncthreads();
if(threadCompute){
const int leftCopies = Min(idxPart, nbCopies);
int leftCopies = nbCopies;
if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){
leftCopies = idxPart - idxCopy;
}
// Left Part
for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (leftCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
// Right Part
for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = Max(leftCopies+1, (nbCopies/4) * 4); otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
}
......@@ -159,10 +161,10 @@ public:
}
if( threadCompute ){
targets->getAttribute(1)[idxPart] += forceX;
targets->getAttribute(2)[idxPart] += forceY;
targets->getAttribute(3)[idxPart] += forceZ;
targets->getAttribute(4)[idxPart] += potential;
targets->getAttribute(1)[idxPart] += potential;
targets->getAttribute(2)[idxPart] += forceX;
targets->getAttribute(3)[idxPart] += forceY;
targets->getAttribute(4)[idxPart] += forceZ;
}
}
......@@ -182,12 +184,10 @@ public:
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
if(threadCompute){
targetX = targets->getPositions()[0][idxPart];
targetY = targets->getPositions()[1][idxPart];
targetZ = targets->getPositions()[2][idxPart];
targetPhys = targets->getAttribute(0)[idxPart];
}
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
......@@ -197,34 +197,34 @@ public:
const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][idxPart];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][idxPart];
sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][idxPart];
sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[idxPart];
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy];
sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy];
}
__syncthreads();
if(threadCompute){
for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
}
......@@ -232,10 +232,10 @@ public:
}
if( threadCompute ){
targets->getAttribute(1)[idxPart] += forceX;
targets->getAttribute(2)[idxPart] += forceY;
targets->getAttribute(3)[idxPart] += forceZ;
targets->getAttribute(4)[idxPart] += potential;
targets->getAttribute(1)[idxPart] += potential;
targets->getAttribute(2)[idxPart] += forceX;
targets->getAttribute(3)[idxPart] += forceY;
targets->getAttribute(4)[idxPart] += forceZ;
}
}
......@@ -243,9 +243,9 @@ public:
}
__device__ void P2POuter(const int3& ,
ContainerClass* const targets,
ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){
ContainerClass* const targets,
ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
......@@ -256,12 +256,10 @@ public:
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
if(threadCompute){
targetX = targets->getPositions()[0][idxPart];
targetY = targets->getPositions()[1][idxPart];
targetZ = targets->getPositions()[2][idxPart];
targetPhys = targets->getAttribute(0)[idxPart];
}
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
__shared__ FReal sourcesX[SHARE_SIZE];
......@@ -271,34 +269,34 @@ public:
const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
if(threadIdx.x < nbCopies){
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][idxPart];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][idxPart];
sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][idxPart];
sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[idxPart];
sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy];
sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy];
}
__syncthreads();
if(threadCompute){
for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectMacro(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,