Commit eb3564f0 authored by BRAMAS Berenger's avatar BRAMAS Berenger

prepare unif kernel for cuda M2L

parent a4f39104
......@@ -455,7 +455,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
message( FATAL_ERROR "nvcc is needed with CUDA." )
endif()
if(NOT DEFINED CUSTOM_CUDA_FLAGS)
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20" CACHE
set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;-ptxas-options=-v;-use_fast_math" CACHE
STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
endif()
# This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
......
......@@ -1093,3 +1093,181 @@ template void FCuda__ReleaseCudaKernel<FCudaP2P<double>>(FCudaP2P<double>* cuker
template dim3 FCuda__GetGridSize< FCudaP2P<double> >(FCudaP2P<double>* kernel, int intervalSize);
template dim3 FCuda__GetBlockSize< FCudaP2P<double> >(FCudaP2P<double>* cukernel);
/////////////////////////////////////////////////////////////////////////////////////
/////////////////////////////////////////////////////////////////////////////////////
#include "../Uniform/FUnifCuda.hpp"
template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
unsigned char* containersPtr, std::size_t containersSize,
FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell,
const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
const int safeOuterInteractions[], const int counterOuterCell,
const OutOfBlockInteraction* insideInteractions,
const int safeInnterInteractions[], const int counterInnerCell, const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
FUnifCuda<float>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template FUnifCuda<float>* FCuda__BuildCudaKernel<FUnifCuda<float>>(void* kernel);
template void FCuda__ReleaseCudaKernel<FUnifCuda<float>>(FUnifCuda<float>* cukernel);
template dim3 FCuda__GetGridSize< FUnifCuda<float> >(FUnifCuda<float>* kernel, int intervalSize);
template dim3 FCuda__GetBlockSize< FUnifCuda<float> >(FUnifCuda<float>* cukernel);
template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
unsigned char* containersPtr, std::size_t containersSize,
FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
int idxLevel, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
unsigned char* currentCellsDownPtr,
unsigned char* externalCellsPtr, std::size_t externalCellsSize,
unsigned char* externalCellsUpPtr,
int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#ifdef SCALFMM_USE_MPI
template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize,
const OutOfBlockInteraction* outsideInteractions,
int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell,
const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
#endif
template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
const int safeOuterInteractions[], const int counterOuterCell,
const OutOfBlockInteraction* insideInteractions,
const int safeInnterInteractions[], const int counterInnerCell, const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
(unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
FUnifCuda<double>* kernel, cudaStream_t currentStream,
const dim3 inGridSize, const dim3 inBlocksSize);
template FUnifCuda<double>* FCuda__BuildCudaKernel<FUnifCuda<double>>(void* kernel);
template void FCuda__ReleaseCudaKernel<FUnifCuda<double>>(FUnifCuda<double>* cukernel);
template dim3 FCuda__GetGridSize< FUnifCuda<double> >(FUnifCuda<double>* kernel, int intervalSize);
template dim3 FCuda__GetBlockSize< FUnifCuda<double> >(FUnifCuda<double>* cukernel);
#ifndef FUNIFCUDA_HPP
#define FUNIFCUDA_HPP
#include "../Cuda/FCudaGlobal.hpp"
#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
#include "../Cuda/FCudaEmptyCellSymb.hpp"
#include "../Cuda/FCudaCompositeCell.hpp"
#include "FUnifCellPOD.hpp"
#define Min(x,y) ((x)<(y)?(x):(y))
#define Max(x,y) ((x)>(y)?(x):(y))
/**
* This class defines what should be a Cuda kernel.
*/
template <class FReal>
class FUnifCuda {
protected:
public:
__device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const FReal& targetPhys,
FReal& forceX, FReal& forceY,FReal& forceZ, FReal& potential,
const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const {
FReal dx = sourcesX - targetX;
FReal dy = sourcesY - targetY;
FReal dz = sourcesZ - targetZ;
FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
FReal inv_distance = sqrt(inv_square_distance);
inv_square_distance *= inv_distance;
inv_square_distance *= targetPhys * sourcesPhys;
dx *= inv_square_distance;
dy *= inv_square_distance;
dz *= inv_square_distance;
forceX += dx;
forceY += dy;
forceZ += dz;
potential += inv_distance * sourcesPhys;
}
static double DSqrt(const double val){
return sqrt(val);
}
static float FSqrt(const float val){
return sqrtf(val);
}
typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass;
static const int NB_THREAD_GROUPS = 30; // 2 x 15
static const int THREAD_GROUP_SIZE = 256;
static const int SHARED_MEMORY_SIZE = 512;// 49152
__device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
}
__device__ void M2M(CellClass /*pole*/, const CellClass /*child*/[8], const int /*level*/) {
}
__device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/,
const int* /*neighPositions*/,
const int /*size*/, const int /*level*/) {
}
__device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) {
}
__device__ void L2P(const CellClass /*local*/, ContainerClass*const /*particles*/){
}
__device__ void P2P(const int3& pos,
ContainerClass* const targets, const ContainerClass* const sources,
ContainerClass* const directNeighborsParticles,
const int* neighborPositions, const int counter){
// Compute with other
P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter);
// Compute inside
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
__shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
const int nbCopies = Min(SHARED_MEMORY_SIZE, targets->getNbParticles()-idxCopy);
for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
sourcesX[idx] = targets->getPositions()[0][idx+idxCopy];
sourcesY[idx] = targets->getPositions()[1][idx+idxCopy];
sourcesZ[idx] = targets->getPositions()[2][idx+idxCopy];
sourcesPhys[idx] = targets->getAttribute(0)[idx+idxCopy];
}
__syncthreads();
if(threadCompute){
int leftCopies = nbCopies;
if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){
leftCopies = idxPart - idxCopy;
}
// Left Part
for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
// Right Part
for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
}
__syncthreads();
}
if( threadCompute ){
targets->getAttribute(1)[idxPart] += potential;
targets->getAttribute(2)[idxPart] += forceX;
targets->getAttribute(3)[idxPart] += forceY;
targets->getAttribute(4)[idxPart] += forceZ;
}
__syncthreads();
}
}
__device__ void P2PRemote(const int3& ,
ContainerClass* const targets, const ContainerClass* const /*sources*/,
ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
__shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy];
sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy];
sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy];
sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy];
}
__syncthreads();
if(threadCompute){
for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
}
__syncthreads();
}
if( threadCompute ){
targets->getAttribute(1)[idxPart] += potential;
targets->getAttribute(2)[idxPart] += forceX;
targets->getAttribute(3)[idxPart] += forceY;
targets->getAttribute(4)[idxPart] += forceZ;
}
__syncthreads();
}
}
}
__device__ void P2POuter(const int3& ,
ContainerClass* const targets,
ContainerClass* const directNeighborsParticles,
const int* /*neighborsPositions*/, const int counter){
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
const bool threadCompute = (idxPart < targets->getNbParticles());
FReal targetX, targetY, targetZ, targetPhys;
FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
__shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
__shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy];
sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy];
sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy];
sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy];
}
__syncthreads();
if(threadCompute){
for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
}
for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
DirectComputation(targetX, targetY, targetZ, targetPhys,
forceX, forceY, forceZ, potential,
sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
}
}
__syncthreads();
}
if( threadCompute ){
targets->getAttribute(1)[idxPart] += potential;
targets->getAttribute(2)[idxPart] += forceX;
targets->getAttribute(3)[idxPart] += forceY;
targets->getAttribute(4)[idxPart] += forceZ;
}
__syncthreads();
}
}
}
__host__ static FUnifCuda* InitKernelKernel(void*){
return nullptr;
}
__host__ static void ReleaseKernel(FUnifCuda* /*todealloc*/){
// nothing to do
}
__host__ static dim3 GetGridSize(const int /*intervalSize*/){
return NB_THREAD_GROUPS;
}
__host__ static dim3 GetBlocksSize(){
return THREAD_GROUP_SIZE;
}
};
#endif // FUNIFCUDA_HPP
......@@ -18,7 +18,7 @@
#ifndef FUNIFTENSOR_HPP
#define FUNIFTENSOR_HPP
#include "Utils/FMath.hpp"
#include "../../Utils/FMath.hpp"
#include "FUnifRoots.hpp"
#include "../Interpolation/FInterpTensor.hpp"
......
......@@ -47,7 +47,7 @@
#include <memory>
template <class FReal>
class FCudaP2P;
class FUnifCuda;
//#define RANDOM_PARTICLES
......@@ -83,7 +83,7 @@ int main(int argc, char* argv[]){
typedef FStarPUCudaWrapper<GroupKernelClass,
FCudaEmptyCellSymb, int, int,
FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
FCudaGroupOfParticles<FReal, 4, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 4, 4, FReal>, FCudaP2P<FReal> > GroupCudaWrapper;
FCudaGroupOfParticles<FReal, 1, 4,