Commit f0494086 authored by BRAMAS Berenger
Browse files

Make the P2P work on CUDA GPUs (several threads but one thread-team)

parent 6adba7f1
...@@ -78,10 +78,8 @@ public: ...@@ -78,10 +78,8 @@ public:
// Compute with other // Compute with other
P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter); P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter);
// Compute inside // Compute inside
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){ for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){
const int idxPart = (idxLoop*blockDim.x+threadIdx.x);
const bool threadCompute = (idxPart < targets->getNbParticles()); const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys; FReal targetX, targetY, targetZ, targetPhys;
...@@ -113,6 +111,7 @@ public: ...@@ -113,6 +111,7 @@ public:
if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){ if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){
leftCopies = idxPart - idxCopy; leftCopies = idxPart - idxCopy;
} }
// Left Part // Left Part
for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4 for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4
DirectComputation(targetX, targetY, targetZ, targetPhys, DirectComputation(targetX, targetY, targetZ, targetPhys,
...@@ -150,7 +149,7 @@ public: ...@@ -150,7 +149,7 @@ public:
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
} }
for(int otherIndex = Max(leftCopies+1, (nbCopies/4) * 4); otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys, DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential, forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
...@@ -167,6 +166,7 @@ public: ...@@ -167,6 +166,7 @@ public:
targets->getAttribute(4)[idxPart] += forceZ; targets->getAttribute(4)[idxPart] += forceZ;
} }
__syncthreads();
} }
} }
...@@ -175,10 +175,8 @@ public: ...@@ -175,10 +175,8 @@ public:
ContainerClass* const directNeighborsParticles, ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){ const int* /*neighborsPositions*/, const int counter){
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){ for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){
const int idxPart = (idxLoop*blockDim.x+threadIdx.x);
const bool threadCompute = (idxPart < targets->getNbParticles()); const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys; FReal targetX, targetY, targetZ, targetPhys;
...@@ -238,6 +236,8 @@ public: ...@@ -238,6 +236,8 @@ public:
targets->getAttribute(4)[idxPart] += forceZ; targets->getAttribute(4)[idxPart] += forceZ;
} }
__syncthreads();
} }
} }
} }
...@@ -247,10 +247,8 @@ public: ...@@ -247,10 +247,8 @@ public:
ContainerClass* const directNeighborsParticles, ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){ const int* /*neighborsPositions*/, const int counter){
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){ for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){
const int idxPart = (idxLoop*blockDim.x+threadIdx.x);
const bool threadCompute = (idxPart < targets->getNbParticles()); const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys; FReal targetX, targetY, targetZ, targetPhys;
...@@ -310,6 +308,7 @@ public: ...@@ -310,6 +308,7 @@ public:
targets->getAttribute(4)[idxPart] += forceZ; targets->getAttribute(4)[idxPart] += forceZ;
} }
__syncthreads();
} }
} }
} }
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment