diff --git a/Src/GroupTree/Core/FGroupOfParticles.hpp b/Src/GroupTree/Core/FGroupOfParticles.hpp index 93478fc5ab679e466e6edfd4e8b8911a21631709..4225095238331ac3e9266d3aea9632163c061ff8 100644 --- a/Src/GroupTree/Core/FGroupOfParticles.hpp +++ b/Src/GroupTree/Core/FGroupOfParticles.hpp @@ -68,8 +68,6 @@ protected: BlockHeader* blockHeader; //< Pointer to leaves information LeafHeader* leafHeader; - //< The total number of particles in the group - const FSize nbParticlesInGroup; //< Pointers to particle position x, y, z FReal* particlePosition[3]; @@ -92,7 +90,7 @@ public: FGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, unsigned char* inAttributes) : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer), - blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0), + blockHeader(nullptr), leafHeader(nullptr), attributesBuffer(nullptr), deleteBuffer(false){ // Move the pointers to the correct position blockHeader = reinterpret_cast<BlockHeader*>(inBuffer); @@ -127,12 +125,12 @@ public: * @param inNumberOfLeaves total number of leaves in the interval (should be <= inEndingIndex-inEndingIndex) */ FGroupOfParticles(const MortonIndex inStartingIndex, const MortonIndex inEndingIndex, const int inNumberOfLeaves, const FSize inNbParticles) - : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(inNbParticles), + : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), deleteBuffer(true){ memset(particlePosition, 0, sizeof(particlePosition)); memset(particleAttributes, 0, sizeof(particleAttributes)); - const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(nbParticlesInGroup+(MemoryAlignementParticles-1)*inNumberOfLeaves); + const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(inNbParticles+(MemoryAlignementParticles-1)*inNumberOfLeaves); // Find the number of leaf to allocate in the blocks FAssertLF((inEndingIndex-inStartingIndex) >= MortonIndex(inNumberOfLeaves)); @@ -161,6 +159,7 @@ public: blockHeader->endingIndex = inEndingIndex; blockHeader->numberOfLeavesInBlock = inNumberOfLeaves; blockHeader->nbParticlesAllocatedInGroup = nbParticlesAllocatedInGroup; + blockHeader->nbParticlesInGroup = inNbParticles; // Init particle pointers blockHeader->positionsLeadingDim = (sizeof(FReal) * nbParticlesAllocatedInGroup); @@ -247,7 +246,7 @@ public: /** Get the total number of particles in the group */ FSize getNbParticlesInGroup() const { - return nbParticlesInGroup; + return blockHeader->nbParticlesInGroup; } /** The size of the interval endingIndex-startingIndex (set from the constructor) */ diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp index eba5a5cd192f366a62416b36e8132d0252d433fc..562e3789b23b31ec0cca5ed1df13a9e21d380293 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp @@ -250,6 +250,27 @@ public: #endif } + void syncData(){ + for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ + for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){ + starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].symb); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].up); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].down); + } + } + { + for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){ + starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R); + starpu_data_release(particleHandles[idxHandle].symb); + starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R); + starpu_data_release(particleHandles[idxHandle].down); + } + } + } + ~FGroupTaskStarPUAlgorithm(){ starpu_resume(); @@ -338,6 +359,11 @@ protected: FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" ); starpu_task_wait_for_all(); + + FLOG( FTic timerSync; ); + syncData(); + FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" ); + starpu_pause(); #ifdef STARPU_USE_CPU diff --git a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp index fb44ebdc6d04a9c15e2da2b631fdc8563613803d..2dc3e070d1c71216829fbb57a04a5887e4762eec 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp @@ -265,6 +265,27 @@ public: #endif } + void syncData(){ + for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ + for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){ + starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].symb); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].up); + starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R); + starpu_data_release(cellHandles[idxLevel][idxHandle].down); + } + } + { + for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){ + starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R); + starpu_data_release(particleHandles[idxHandle].symb); + starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R); + starpu_data_release(particleHandles[idxHandle].down); + } + } + } + ~FGroupTaskStarPUMpiAlgorithm(){ starpu_resume(); @@ -324,6 +345,7 @@ public: } protected: + /** * Runs the complete algorithm. */ @@ -362,6 +384,11 @@ protected: #endif starpu_task_wait_for_all(); + + FLOG( FTic timerSync; ); + syncData(); + FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" ); + starpu_pause(); #ifdef STARPU_USE_CPU diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu index cd1980ca2599ea422155b335674bc33e093c652b..4d8ff95c103fb033fc7a943bc2a3a84d85cbdfe6 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu @@ -31,12 +31,17 @@ __global__ void FCuda__bottomPassPerform(unsigned char* leafCellsPtr, std::size_ CudaKernelClass* kernel){ CellContainerClass leafCells(leafCellsPtr, leafCellsSize, leafCellsUpPtr, nullptr); ParticleContainerGroupClass containers(containersPtr, containersSize, nullptr); + printf("containers.getNbParticlesInGroup() %ld \n", containers.getNbParticlesInGroup()); + printf("containers.getSizeOfInterval() %ld \n", containers.getSizeOfInterval()); + printf("containers.getStartingIndex() %lld \n", containers.getStartingIndex()); + printf("containers.getEndingIndex() %lld \n", containers.getEndingIndex()); for(int leafIdx = blockIdx.x ; leafIdx < leafCells.getNumberOfCellsInBlock() ; leafIdx += gridDim.x){ typename CellContainerClass::CompleteCellClass cell = leafCells.getUpCell(leafIdx); ParticleGroupClass particles = containers.template getLeaf<ParticleGroupClass>(leafIdx); FCudaAssertLF(leafCells.getCellMortonIndex(leafIdx) == containers.getLeafMortonIndex(leafIdx)); kernel->P2M(cell, &particles); + printf("particles.getNbParticles() %lld \n", particles.getNbParticles()); } } @@ -65,70 +70,48 @@ __host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){ + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel){ CellContainerClass currentCells(currentCellsPtr, currentCellsSize,currentCellsUpPtr,nullptr); - CellContainerClass subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], subCellGroupsUpPtr.values[idx], nullptr); - } + CellContainerClass subCellGroup(childCellsPtr, childCellsSize,childCellsUpPtr,nullptr); - const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); - const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); + const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); + const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); - if(firstCell == currentCells.getNumberOfCellsInBlock()){ - return ; - } + int idxParentCell = currentCells.getCellIndex(firstParent); + int idxChildCell = subCellGroup.getFistChildIdx(firstParent); - FCudaAssertLF(nbSubCellGroups != 0); - int idxSubCellGroup = 0; - int idxChildCell = 0; - {// Find first child - const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell); - while(idxSubCellGroup != nbSubCellGroups - && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){ - idxSubCellGroup += 1; - } - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); - idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0)); - } - FCudaAssertLF(idxChildCell != -1); - - for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){ - typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(cellIdx); - FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx)); + while(true){ + typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(idxParentCell); typename CellContainerClass::CompleteCellClass child[8]; - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = nullptr; } - while(idxSubCellGroup != nbSubCellGroups - && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7); - FCudaAssertLF(child[idxChild].symb == nullptr); - child[idxChild] = subCellGroups[idxSubCellGroup].getUpCell(idxChildCell); + do{ + const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); + child[idxChild] = subCellGroup.getUpCell(idxChildCell); idxChildCell += 1; - if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){ - idxChildCell = 0; - idxSubCellGroup += 1; - } - } + }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); kernel->M2M(cell, child, idxLevel); + + if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; } } template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ FCuda__upwardPassPerform @@ -136,8 +119,8 @@ __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::siz CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>> (currentCellsPtr, currentCellsSize,currentCellsUpPtr, - subCellGroupsPtr, subCellGroupsSize,subCellGroupsUpPtr, - nbSubCellGroups, idxLevel, kernel); + childCellsPtr, childCellsSize,childCellsUpPtr, + idxLevel, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); } @@ -272,17 +255,17 @@ __host__ void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std: template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsUpPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel){ if(blockIdx.x != 0){ return; } - CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr); - CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr); + CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr); + CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr); for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){ const int cellPos = cellsOther.getCellIndex(outsideInteractions[outInterIdx].outIndex); @@ -310,10 +293,10 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsDownPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsUpPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ OutOfBlockInteraction* cuOutsideInteractions; @@ -326,10 +309,10 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s <SymboleCellClass, PoleCellClass, LocalCellClass, CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>>(currentCellsPtr, currentCellsSize, - currentCellsUpPtr, currentCellsDownPtr, + currentCellsDownPtr, externalCellsPtr, externalCellsSize, - externalCellsUpPtr, externalCellsDownPtr, - idxLevel, cuOutsideInteractions, + externalCellsUpPtr, + idxLevel, mode, cuOutsideInteractions, nbOutsideInteractions, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); @@ -345,77 +328,56 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){ - FCudaAssertLF(nbSubCellGroups != 0); - CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr); - CellContainerClass subCellGroups[9]; - for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){ - subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], nullptr, subCellGroupsDownPtr.values[idx]); - } + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel){ + CellContainerClass currentCells(currentCellsPtr, currentCellsSize,nullptr,currentCellsDownPtr); + CellContainerClass subCellGroup(childCellsPtr, childCellsSize,nullptr,childCellsDownPtr); - const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); - const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x)); + const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); + const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); - if(firstCell == currentCells.getNumberOfCellsInBlock()){ - return ; - } + int idxParentCell = currentCells.getCellIndex(firstParent); + int idxChildCell = subCellGroup.getFistChildIdx(firstParent); - FCudaAssertLF(nbSubCellGroups != 0); - int idxSubCellGroup = 0; - int idxChildCell = 0; - {// Find first child - const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell); - while(idxSubCellGroup != nbSubCellGroups - && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){ - idxSubCellGroup += 1; - } - FCudaAssertLF(idxSubCellGroup != nbSubCellGroups); - idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0)); - } - FCudaAssertLF(idxChildCell != -1); - - for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){ - typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx); - FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx)); + while(true){ + typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(idxParentCell); typename CellContainerClass::CompleteCellClass child[8]; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ child[idxChild].symb = nullptr; } - while(idxSubCellGroup != nbSubCellGroups - && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){ - const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7); - FCudaAssertLF(child[idxChild].symb == nullptr); - child[idxChild] = subCellGroups[idxSubCellGroup].getDownCell(idxChildCell); + do{ + const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); + child[idxChild] = subCellGroup.getDownCell(idxChildCell); idxChildCell += 1; - if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){ - idxChildCell = 0; - idxSubCellGroup += 1; - } - } + }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); kernel->L2L(cell, child, idxLevel); + + if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; } } template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> __host__ void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize){ FCuda__downardPassPerform <SymboleCellClass, PoleCellClass, LocalCellClass, CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass> <<<inGridSize, inBlocksSize, 0, currentStream>>> - (currentCellsPtr, currentCellsSize, currentCellsDownPtr, subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr, - nbSubCellGroups, idxLevel, kernel); + (currentCellsPtr, currentCellsSize, currentCellsDownPtr, childCellsPtr, childCellsSize, childCellsDownPtr, + idxLevel, kernel); FCudaCheckAfterCall(); FCudaCheck(cudaStreamSynchronize(currentStream)); } @@ -682,9 +644,8 @@ template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroup template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, -FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, -FCudaParams<unsigned char*,9> subCellGroupsUpPtr, -int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, +unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -705,19 +666,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, +unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -770,9 +730,8 @@ unsigned char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -793,19 +752,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData, template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -856,9 +814,8 @@ unsigned char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -879,19 +836,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData, template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>, @@ -945,9 +901,8 @@ unsigned char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -968,19 +923,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -1031,9 +985,8 @@ unsigned char* containersPtr, std::size_t containersSize, template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, +int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, @@ -1054,19 +1007,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, - unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, - int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream, - const dim3 inGridSize, const dim3 inBlocksSize); +unsigned char* currentCellsDownPtr, +unsigned char* externalCellsPtr, std::size_t externalCellsSize, +unsigned char* externalCellsUpPtr, +int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, +int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream, + const dim3 inGridSize, const dim3 inBlocksSize); template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> > (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, +int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>, diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp index 5559c908570cc75388ecfa1c2018ab4e98d53c59..4ba6615534e597cca892fbd02ce44a5c93a6e8cf 100644 --- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp +++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp @@ -17,9 +17,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__upwardPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsUpPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, @@ -43,10 +42,10 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__transferInoutPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, - unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr, + unsigned char* currentCellsUpPtr, unsigned char* externalCellsPtr, std::size_t externalCellsSize, - unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr, - int idxLevel, const OutOfBlockInteraction* outsideInteractions, + unsigned char* externalCellsDownPtr, + int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); @@ -54,9 +53,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass> void FCuda__downardPassCallback( unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr, - FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize, - FCudaParams<unsigned char*,9> subCellGroupsDownPtr, - int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, + unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, + int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, const dim3 inGridSize, const dim3 inBlocksSize); #ifdef SCALFMM_USE_MPI template <class SymboleCellClass, class PoleCellClass, class LocalCellClass, diff --git a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp index d7d8aff5cea8627d3ad487b3e1286fc1af7bed2b..a119cc22ed8b35c53fc0fc835a621fb1ce15e95f 100644 --- a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp +++ b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp @@ -59,8 +59,6 @@ protected: BlockHeader* blockHeader; //< Pointer to leaves information LeafHeader* leafHeader; - //< The total number of particles in the group - const FSize nbParticlesInGroup; //< Pointers to particle position x, y, z FReal* particlePosition[3]; @@ -78,11 +76,12 @@ public: __device__ FCudaGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte, unsigned char* inAttributes) : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer), - blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0), + blockHeader(nullptr), leafHeader(nullptr), attributesBuffer(nullptr){ // Move the pointers to the correct position - blockHeader = reinterpret_cast<BlockHeader*>(memoryBuffer); - leafHeader = reinterpret_cast<LeafHeader*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->numberOfLeavesInBlock*sizeof(int))); + blockHeader = reinterpret_cast<BlockHeader*>(inBuffer); + inBuffer += sizeof(BlockHeader); + leafHeader = reinterpret_cast<LeafHeader*>(inBuffer); // Init particle pointers // Assert blockHeader->positionsLeadingDim == (sizeof(FReal) * blockHeader->nbParticlesAllocatedInGroup); @@ -122,7 +121,7 @@ public: /** Get the total number of particles in the group */ __device__ FSize getNbParticlesInGroup() const { - return nbParticlesInGroup; + return blockHeader->nbParticlesInGroup; } /** The size of the interval endingIndex-startingIndex (set from the constructor) */ diff --git a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp index b0b2679d12ec8c5b8feda2a6cdc8f2ad4c7580ae..5e438872e83e205ad5fdc3638c822eae068d6c2f 100644 --- a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp +++ b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp @@ -96,21 +96,9 @@ public: FStarPUPtrInterface* worker = nullptr; int nbSubCellGroups = 0; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - FCudaParams<unsigned char*,9> subCellGroupsPtr; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); - FCudaParams<std::size_t,9> subCellGroupsSize; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); - FCudaParams<unsigned char*,9> subCellGroupsUpPtr; - memset(&subCellGroupsUpPtr, 0, sizeof(subCellGroupsUpPtr)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]); - subCellGroupsUpPtr.values[idxSubGroup] = (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]); - } - CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass, @@ -118,8 +106,10 @@ public: (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), - subCellGroupsPtr,subCellGroupsSize,subCellGroupsUpPtr, - nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), + idxLevel, kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } @@ -130,8 +120,8 @@ public: static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - const std::vector<OutOfBlockInteraction>* outsideInteractions; - int intervalSize; + const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -156,7 +146,7 @@ public: static void transferInPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -174,9 +164,10 @@ public: static void transferInoutPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; int idxLevel = 0; - const std::vector<OutOfBlockInteraction>* outsideInteractions; - int intervalSize; - starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize); + const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; + int intervalSize = 0; + int mode = 0; + starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -186,11 +177,9 @@ public: STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), - STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]), - (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[4]), - (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[5]), - idxLevel, outsideInteractions->data(), int(outsideInteractions->size()), kernel, + idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } @@ -202,21 +191,9 @@ public: FStarPUPtrInterface* worker = nullptr; int nbSubCellGroups = 0; int idxLevel = 0; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize); - FCudaParams<unsigned char*,9> subCellGroupsPtr; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr)); - FCudaParams<std::size_t,9> subCellGroupsSize; - memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize)); - FCudaParams<unsigned char*,9> subCellGroupsDownPtr; - memset(&subCellGroupsDownPtr, 0, sizeof(subCellGroupsDownPtr)); - for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){ - subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2])); - subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2])); - subCellGroupsDownPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3])); - } - CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass, @@ -224,8 +201,10 @@ public: (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]), STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]), (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]), - subCellGroupsPtr,subCellGroupsSize,subCellGroupsDownPtr, - nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]), + STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]), + (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]), + idxLevel, kernel, starpu_cuda_get_local_stream(), FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel)); } ///////////////////////////////////////////////////////////////////////////////////// @@ -237,7 +216,7 @@ public: FStarPUPtrInterface* worker = nullptr; const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -260,7 +239,7 @@ public: static void directInPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; @@ -276,7 +255,7 @@ public: static void directInoutPassCallback(void *buffers[], void *cl_arg){ FStarPUPtrInterface* worker = nullptr; const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr; - int intervalSize; + int intervalSize = 0; starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize); CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()]; diff --git a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp index 23325ea829ea174b4d1fa7946d24895c2e1b4259..5cd96a6d0aec89451aa46111752cf4a71656310e 100644 --- a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp +++ b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp @@ -18,6 +18,8 @@ public: // the pole represents all particles under if(threadIdx.x == 0){ *pole.up += particles->getNbParticles(); + printf("*pole.up %d += particles->getNbParticles(); %d \n", + *pole.up , particles->getNbParticles()); } } diff --git a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp index 07015cfa8232431bddcfe78a771817f07437114e..522418cb77d05623f62cc10ed0d6808a13f863d4 100644 --- a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp +++ b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp @@ -80,7 +80,7 @@ int main(int argc, char* argv[]){ typedef FGroupTestParticleContainer<FReal> GroupContainerClass; typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 0, 1, long long int> GroupOctreeClass; - typedef FStarPUAllCpuCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass; + typedef FStarPUAllCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >> GroupKernelClass; typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper; typedef FStarPUCudaWrapper<GroupKernelClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,