diff --git a/CMakeLists.txt b/CMakeLists.txt index e5a962af52497796a27cd1d74c88be77996f3133..e83c8d19852282c95ae82281abeca8079fd67937 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -455,7 +455,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/ message( FATAL_ERROR "nvcc is needed with CUDA." ) endif() if(NOT DEFINED CUSTOM_CUDA_FLAGS) - set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20" CACHE + set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;-ptxas-options=-v;-use_fast_math" CACHE STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math") endif() # This is needed to remove backslash after space in ADD_CUSTOM_COMMAND diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu index 8cdec810bdeddabad5aa10dbd72c4e712bd85113..53ff745ce694be40ec5ff9924b1c2dafe9ad77b3 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu @@ -1093,3 +1093,181 @@ template void FCuda__ReleaseCudaKernel<FCudaP2P<double>>(FCudaP2P<double>* cuker template dim3 FCuda__GetGridSize< FCudaP2P<double> >(FCudaP2P<double>* kernel, int intervalSize); template dim3 FCuda__GetBlockSize< FCudaP2P<double> >(FCudaP2P<double>* cukernel); + + + +///////////////////////////////////////////////////////////////////////////////////// +///////////////////////////////////////////////////////////////////////////////////// + +#include "../Uniform/FUnifCuda.hpp" + +template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr, +unsigned char* containersPtr, std::size_t containersSize, + FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#ifdef SCALFMM_USE_MPI +template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, + unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr, + int idxLevel, const OutOfBlockInteraction* outsideInteractions, + int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#endif +template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, + unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + int 
idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#ifdef SCALFMM_USE_MPI +template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + unsigned char* externalContainersPtr, std::size_t externalContainersSize, + const OutOfBlockInteraction* outsideInteractions, + int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell, +const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#endif +template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr, +const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, +const int safeOuterInteractions[], const int counterOuterCell, + const OutOfBlockInteraction* insideInteractions, + const int safeInnterInteractions[], const int counterInnerCell, const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> > + (unsigned char* 
leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr, + unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + FUnifCuda<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template FUnifCuda<float>* FCuda__BuildCudaKernel<FUnifCuda<float>>(void* kernel); +template void FCuda__ReleaseCudaKernel<FUnifCuda<float>>(FUnifCuda<float>* cukernel); + +template dim3 FCuda__GetGridSize< FUnifCuda<float> >(FUnifCuda<float>* kernel, int intervalSize); +template dim3 FCuda__GetBlockSize< FUnifCuda<float> >(FUnifCuda<float>* cukernel); + + + + +template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr, +unsigned char* containersPtr, std::size_t containersSize, + FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#ifdef SCALFMM_USE_MPI +template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, + unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr, + int idxLevel, const OutOfBlockInteraction* outsideInteractions, + int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#endif +template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, + unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void 
FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#ifdef SCALFMM_USE_MPI +template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + unsigned char* externalContainersPtr, std::size_t externalContainersSize, + const OutOfBlockInteraction* outsideInteractions, + int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell, +const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); +#endif +template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr, +const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, +const int safeOuterInteractions[], const int counterOuterCell, + const OutOfBlockInteraction* insideInteractions, + const int safeInnterInteractions[], const int counterInnerCell, const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, + FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> > + (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr, + unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr, + FUnifCuda<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); + +template FUnifCuda<double>* FCuda__BuildCudaKernel<FUnifCuda<double>>(void* kernel); +template void FCuda__ReleaseCudaKernel<FUnifCuda<double>>(FUnifCuda<double>* cukernel); + +template dim3 FCuda__GetGridSize< FUnifCuda<double> >(FUnifCuda<double>* kernel, int intervalSize); +template dim3 FCuda__GetBlockSize< FUnifCuda<double> >(FUnifCuda<double>* cukernel); diff --git a/Src/GroupTree/Uniform/FUnifCuda.hpp b/Src/GroupTree/Uniform/FUnifCuda.hpp new file mode 100644 index 
0000000000000000000000000000000000000000..ad97f2efd14aace14b99b77c7c284210b1328056 --- /dev/null +++ b/Src/GroupTree/Uniform/FUnifCuda.hpp @@ -0,0 +1,342 @@ +#ifndef FUNIFCUDA_HPP +#define FUNIFCUDA_HPP + +#include "../Cuda/FCudaGlobal.hpp" +#include "../Cuda/FCudaGroupAttachedLeaf.hpp" +#include "../Cuda/FCudaEmptyCellSymb.hpp" +#include "../Cuda/FCudaCompositeCell.hpp" + +#include "FUnifCellPOD.hpp" + +#define Min(x,y) ((x)<(y)?(x):(y)) +#define Max(x,y) ((x)>(y)?(x):(y)) + +/** + * This class defines what should be a Cuda kernel. + */ +template <class FReal> +class FUnifCuda { +protected: +public: + + __device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const FReal& targetPhys, + FReal& forceX, FReal& forceY,FReal& forceZ, FReal& potential, + const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const { + FReal dx = sourcesX - targetX; + FReal dy = sourcesY - targetY; + FReal dz = sourcesZ - targetZ; + + FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz); + FReal inv_distance = sqrt(inv_square_distance); + + inv_square_distance *= inv_distance; + inv_square_distance *= targetPhys * sourcesPhys; + + dx *= inv_square_distance; + dy *= inv_square_distance; + dz *= inv_square_distance; + + forceX += dx; + forceY += dy; + forceZ += dz; + potential += inv_distance * sourcesPhys; + } + + static double DSqrt(const double val){ + return sqrt(val); + } + + static float FSqrt(const float val){ + return sqrtf(val); + } + + typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass; + typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass; + + static const int NB_THREAD_GROUPS = 30; // 2 x 15 + static const int THREAD_GROUP_SIZE = 256; + static const int SHARED_MEMORY_SIZE = 512;// 49152 + + __device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) { + } + + __device__ void M2M(CellClass /*pole*/, const CellClass /*child*/[8], const int /*level*/) { + } + + __device__ void M2L(CellClass /*pole*/, const CellClass* /*distantNeighbors*/, + const int* /*neighPositions*/, + const int /*size*/, const int /*level*/) { + } + + __device__ void L2L(const CellClass /*local*/, CellClass /*child*/[8], const int /*level*/) { + } + + __device__ void L2P(const CellClass /*local*/, ContainerClass*const /*particles*/){ + } + + __device__ void P2P(const int3& pos, + ContainerClass* const targets, const ContainerClass* const sources, + ContainerClass* const directNeighborsParticles, + const int* neighborPositions, const int counter){ + // Compute with other + P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter); + // Compute inside + const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x; + for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){ + const int idxPart = (idxLoop*blockDim.x)+threadIdx.x; + const bool threadCompute = (idxPart < targets->getNbParticles()); + + FReal targetX, targetY, targetZ, targetPhys; + FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? 
targets->getAttribute(0)[idxPart] : 0); + + for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){ + __shared__ FReal sourcesX[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesY[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE]; + + const int nbCopies = Min(SHARED_MEMORY_SIZE, targets->getNbParticles()-idxCopy); + for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){ + sourcesX[idx] = targets->getPositions()[0][idx+idxCopy]; + sourcesY[idx] = targets->getPositions()[1][idx+idxCopy]; + sourcesZ[idx] = targets->getPositions()[2][idx+idxCopy]; + sourcesPhys[idx] = targets->getAttribute(0)[idx+idxCopy]; + } + + __syncthreads(); + + if(threadCompute){ + int leftCopies = nbCopies; + if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){ + leftCopies = idxPart - idxCopy; + } + + // Left Part + for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + } + + for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + } + // Right Part + for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + } + + for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + } + } + + __syncthreads(); + } + + if( threadCompute ){ + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + 
targets->getAttribute(4)[idxPart] += forceZ; + } + + __syncthreads(); + } + } + + __device__ void P2PRemote(const int3& , + ContainerClass* const targets, const ContainerClass* const /*sources*/, + ContainerClass* const directNeighborsParticles, + const int* /*neighborsPositions*/, const int counter){ + for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + + const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x; + for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){ + const int idxPart = (idxLoop*blockDim.x)+threadIdx.x; + const bool threadCompute = (idxPart < targets->getNbParticles()); + + FReal targetX, targetY, targetZ, targetPhys; + FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0); + + for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){ + __shared__ FReal sourcesX[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesY[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE]; + + const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy); + for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){ + sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy]; + sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy]; + sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy]; + sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy]; + } + + __syncthreads(); + + if(threadCompute){ + for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + } + + for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + } + } + + __syncthreads(); + } + + if( threadCompute ){ + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + targets->getAttribute(4)[idxPart] += forceZ; + } + + + __syncthreads(); + } + } + } + + __device__ void P2POuter(const int3& , + ContainerClass* const targets, + ContainerClass* const directNeighborsParticles, + const int* /*neighborsPositions*/, const int counter){ + for(int 
idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + + const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x; + for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){ + const int idxPart = (idxLoop*blockDim.x)+threadIdx.x; + const bool threadCompute = (idxPart < targets->getNbParticles()); + + FReal targetX, targetY, targetZ, targetPhys; + FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0; + + targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0); + targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0); + targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0); + targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0); + + for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){ + __shared__ FReal sourcesX[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesY[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE]; + __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE]; + + const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy); + for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){ + sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy]; + sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy]; + sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy]; + sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy]; + } + + __syncthreads(); + + if(threadCompute){ + for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4 + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]); + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]); + } + + for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero + DirectComputation(targetX, targetY, targetZ, targetPhys, + forceX, forceY, forceZ, potential, + sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]); + } + } + + __syncthreads(); + } + + if( threadCompute ){ + targets->getAttribute(1)[idxPart] += potential; + targets->getAttribute(2)[idxPart] += forceX; + targets->getAttribute(3)[idxPart] += forceY; + targets->getAttribute(4)[idxPart] += forceZ; + } + + __syncthreads(); + } + } + } + + __host__ static FUnifCuda* InitKernelKernel(void*){ + return nullptr; + } + + __host__ static void ReleaseKernel(FUnifCuda* /*todealloc*/){ + // nothing to do + } + + __host__ static dim3 GetGridSize(const int /*intervalSize*/){ + return NB_THREAD_GROUPS; + } + + __host__ static dim3 GetBlocksSize(){ + return THREAD_GROUP_SIZE; + } +}; + +#endif // FUNIFCUDA_HPP + diff --git a/Src/Kernels/Uniform/FUnifTensor.hpp b/Src/Kernels/Uniform/FUnifTensor.hpp index 
a3895283d30738c1b16a3b6150b3e4be9cc9c350..ab233ab57a03f2b684312c38744a70e1fa9e9ff5 100644 --- a/Src/Kernels/Uniform/FUnifTensor.hpp +++ b/Src/Kernels/Uniform/FUnifTensor.hpp @@ -18,7 +18,7 @@ #ifndef FUNIFTENSOR_HPP #define FUNIFTENSOR_HPP -#include "Utils/FMath.hpp" +#include "../../Utils/FMath.hpp" #include "FUnifRoots.hpp" #include "../Interpolation/FInterpTensor.hpp" diff --git a/Tests/GroupTree/testBlockedUnifCuda.cpp b/Tests/GroupTree/testBlockedUnifCuda.cpp index df87193cfa8c3641f3e9796081ab1b00f8ca3c00..63380c7f15a0ae63fe084c9c485a1c4fbff3fac3 100644 --- a/Tests/GroupTree/testBlockedUnifCuda.cpp +++ b/Tests/GroupTree/testBlockedUnifCuda.cpp @@ -47,7 +47,7 @@ #include <memory> template <class FReal> -class FCudaP2P; +class FUnifCuda; //#define RANDOM_PARTICLES @@ -83,7 +83,7 @@ int main(int argc, char* argv[]){ typedef FStarPUCudaWrapper<GroupKernelClass, FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, - FCudaGroupOfParticles<FReal, 4, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 4, 4, FReal>, FCudaP2P<FReal> > GroupCudaWrapper; + FCudaGroupOfParticles<FReal, 1, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 1, 4, FReal>, FUnifCuda<FReal> > GroupCudaWrapper; typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper, GroupCudaWrapper > GroupAlgorithm;
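
Note on the new FUnifCuda kernel: P2P, P2PRemote and P2POuter all follow the same pattern — each thread owns one target particle, source particles are staged through shared memory in tiles of SHARED_MEMORY_SIZE, and the 1/r interaction is accumulated per thread before being written back to the particle attributes. The standalone kernel below is a minimal sketch of that tiling pattern only, not part of this patch or of the ScalFMM API: the kernel name directTiled, the flat float4 particle layout (x, y, z, physical value) and the tile size are illustrative assumptions, and the self-interaction is skipped with a simple branch instead of the left/right loop split used in FUnifCuda::P2P.

    // Illustrative sketch only: same shared-memory tiling idea as FUnifCuda::P2P,
    // but on a flat float4 array instead of ScalFMM group containers.
    #include <cuda_runtime.h>

    __global__ void directTiled(const float4* particles, float4* forcePot, int nbParticles){
        constexpr int TILE = 256;              // plays the role of SHARED_MEMORY_SIZE
        __shared__ float4 tile[TILE];

        const int idxPart = blockIdx.x * blockDim.x + threadIdx.x;
        const bool threadCompute = (idxPart < nbParticles);
        const float4 target = threadCompute ? particles[idxPart] : make_float4(0, 0, 0, 0);
        float fx = 0, fy = 0, fz = 0, pot = 0;

        for(int idxCopy = 0 ; idxCopy < nbParticles ; idxCopy += TILE){
            // Stage one tile of sources in shared memory (all threads cooperate).
            const int nbCopies = min(TILE, nbParticles - idxCopy);
            for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
                tile[idx] = particles[idxCopy + idx];
            }
            __syncthreads();

            if(threadCompute){
                for(int other = 0 ; other < nbCopies ; ++other){
                    if(idxCopy + other == idxPart) continue;   // skip self interaction
                    // Same arithmetic as DirectComputation in the patch.
                    const float dx = tile[other].x - target.x;
                    const float dy = tile[other].y - target.y;
                    const float dz = tile[other].z - target.z;
                    float inv_square_distance = 1.0f / (dx*dx + dy*dy + dz*dz);
                    const float inv_distance = sqrtf(inv_square_distance);
                    inv_square_distance *= inv_distance * target.w * tile[other].w;
                    fx += dx * inv_square_distance;
                    fy += dy * inv_square_distance;
                    fz += dz * inv_square_distance;
                    pot += inv_distance * tile[other].w;
                }
            }
            __syncthreads();   // tile may be overwritten on the next iteration
        }

        if(threadCompute){
            forcePot[idxPart].x += fx;
            forcePot[idxPart].y += fy;
            forcePot[idxPart].z += fz;
            forcePot[idxPart].w += pot;   // potential accumulated in .w
        }
    }

A launch such as directTiled<<<(nbParticles + 255) / 256, 256>>>(particles, forcePot, nbParticles) covers every target once; in the patch the equivalent sizing is provided by FUnifCuda::GetGridSize and GetBlocksSize (NB_THREAD_GROUPS blocks of THREAD_GROUP_SIZE threads), and the 4x manual unrolling of the inner loop is an additional optimization kept in the real kernel.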