diff --git a/Examples/RotationMPIFMM.cpp b/Examples/RotationMPIFMM.cpp index 50084eb8611fec973cd27444bcf7d489389b26e6..978df5f73bcc22b128da2252d278a55f60685ced 100644 --- a/Examples/RotationMPIFMM.cpp +++ b/Examples/RotationMPIFMM.cpp @@ -164,6 +164,8 @@ int main(int argc, char* argv[]) tree.getBoxWidth(), tree.getHeight(), &finalParticles,&balancer); + std::cout << "Local nb particles after sort " + << finalParticles.getSize() << std::endl; for(FSize idx = 0 ; idx < finalParticles.getSize(); ++idx){ tree.insert(finalParticles[idx].position,finalParticles[idx].indexInFile,finalParticles[idx].physicalValue); } diff --git a/Licence.txt b/Licence.txt index 7be5457ef5d7c4c4357d70b515d5019756b5b8ef..dbaa609a5fcf69ca7343bdb582a3b71f374f3a33 100644 --- a/Licence.txt +++ b/Licence.txt @@ -4,6 +4,9 @@ the more restrictive has to be used. ScalFmm est régi par la licence CeCILL-C & LGPL, en cas de conflit la plus restrictive prime. +Folders under Addons might have separate Licence, in such case +one can find a dedicated Licence file where appropriate. + ////////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////// diff --git a/Src/BalanceTree/FEqualize.hpp b/Src/BalanceTree/FEqualize.hpp index b0c46806c548a4df08200c0075bf623a01634612..95caba72ad4e7dc08805001ded2feb3a0e3f12a7 100644 --- a/Src/BalanceTree/FEqualize.hpp +++ b/Src/BalanceTree/FEqualize.hpp @@ -81,7 +81,10 @@ public: pack.elementTo = Min(allObjectives[idxProc].second , myCurrentInterval.second) - myCurrentInterval.first; // Next time give from the previous end currentElement = pack.elementTo; - packToSend.push_back(pack); + + if(pack.elementTo - pack.elementFrom != 0){ + packToSend.push_back(pack); + } // Progress idxProc += 1; } diff --git a/Src/BalanceTree/FPartitionsMapping.hpp b/Src/BalanceTree/FPartitionsMapping.hpp new file mode 100644 index 0000000000000000000000000000000000000000..50f2525978cd28204f038afa9d8fb062f0f32336 --- /dev/null +++ b/Src/BalanceTree/FPartitionsMapping.hpp @@ -0,0 +1,316 @@ +// =================================================================================== +// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner +// olivier.coulaud@inria.fr, berenger.bramas@inria.fr +// This software is a computer program whose purpose is to compute the FMM. +// +// This software is governed by the CeCILL-C and LGPL licenses and +// abiding by the rules of distribution of free software. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public and CeCILL-C Licenses for more details. +// "http://www.cecill.info". +// "http://www.gnu.org/licenses". +// =================================================================================== +#ifndef FPARTITIONSMAPPING_HPP +#define FPARTITIONSMAPPING_HPP + +#include "Utils/FGlobal.hpp" +#include "Utils/FMpi.hpp" +#include "Containers/FVector.hpp" +#include "FLeafBalance.hpp" +#include "Files/FMpiTreeBuilder.hpp" + +template <class FReal> +class FPartitionsMapping { +protected: + FMpi::FComm comm; + + //! The number of particles from the initial decomposition + FSize nbParticlesInitial; + //! 
The number of particles from the scalfmm decomposition + FSize nbParticlesWorking; + + std::unique_ptr<FSize[]> nbParticlesSentToOthers; + std::unique_ptr<FSize[]> offsetNbParticlesSentToOthers; + std::unique_ptr<FSize[]> nbParticlesRecvFromOthers; + std::unique_ptr<FSize[]> offsetNbParticlesRecvFromOthers; + + std::unique_ptr<FSize[]> mappingToOthers; + + +public: + FPartitionsMapping(const FMpi::FComm& inComm) + : comm(inComm), nbParticlesInitial(0), nbParticlesWorking(0) { + } + + void setComm(const FMpi::FComm& inComm){ + comm = inComm; + } + + template< int NbPhysicalValuesPerPart> + struct TestParticle{ + FPoint<FReal> position; + std::array<FReal, NbPhysicalValuesPerPart> physicalValues; + FSize localIndex; + int initialProcOwner; + + const FPoint<FReal>& getPosition() const { + return position; + } + }; + + template< int NbPhysicalValuesPerPart, class FillerClass> + FVector<TestParticle<NbPhysicalValuesPerPart>> distributeParticles(const FSize inNbParticles, + const FPoint<FReal>& centerOfBox, const FReal boxWidth, + const int TreeHeight, FillerClass filler){ + nbParticlesInitial = inNbParticles; + + //////////////////////////////////////////////////////// + + std::unique_ptr<TestParticle<NbPhysicalValuesPerPart>[]> initialParticles(new TestParticle<NbPhysicalValuesPerPart>[inNbParticles]); + + // Create the array to distribute + for(int idxPart = 0 ; idxPart < nbParticlesInitial ; ++idxPart){ + filler(idxPart, &initialParticles[idxPart].position, &initialParticles[idxPart].physicalValues); + initialParticles[idxPart].localIndex = idxPart; + initialParticles[idxPart].initialProcOwner = comm.processId(); + } + + FVector<TestParticle<NbPhysicalValuesPerPart>> finalParticles; + FLeafBalance balancer; + FMpiTreeBuilder< FReal,TestParticle<NbPhysicalValuesPerPart> >::DistributeArrayToContainer(comm,initialParticles.get(), + nbParticlesInitial, + centerOfBox, + boxWidth, + TreeHeight, + &finalParticles, &balancer); + + FQuickSort<TestParticle<NbPhysicalValuesPerPart>,FSize>::QsOmp(finalParticles.data(), finalParticles.getSize(), + [](const TestParticle<NbPhysicalValuesPerPart>& p1, + const TestParticle<NbPhysicalValuesPerPart>& p2){ + return p1.initialProcOwner < p2.initialProcOwner + || (p1.initialProcOwner == p2.initialProcOwner + && p1.localIndex < p2.localIndex); + }); + + //////////////////////////////////////////////////////// + + nbParticlesWorking = finalParticles.getSize(); + + nbParticlesRecvFromOthers.reset(new FSize[comm.processCount()]); + memset(nbParticlesRecvFromOthers.get(), 0 , sizeof(FSize)*comm.processCount()); + + for(FSize idxPart = 0 ; idxPart < finalParticles.getSize() ; ++idxPart){ + // Count the particles received from each proc + nbParticlesRecvFromOthers[finalParticles[idxPart].initialProcOwner] += 1; + + } + + offsetNbParticlesRecvFromOthers.reset(new FSize[comm.processCount()+1]); + offsetNbParticlesRecvFromOthers[0] = 0; + + for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){ + offsetNbParticlesRecvFromOthers[idxProc+1] = offsetNbParticlesRecvFromOthers[idxProc] + + nbParticlesRecvFromOthers[idxProc]; + } + + //////////////////////////////////////////////////////// + + std::unique_ptr<FSize[]> nbParticlesRecvFromOthersAllAll(new FSize[comm.processCount()*comm.processCount()]); + // Exchange how many each proc receive from aother + FMpi::MpiAssert( MPI_Allgather( nbParticlesRecvFromOthers.get(), comm.processCount(), FMpi::GetType(FSize()), + nbParticlesRecvFromOthersAllAll.get(), comm.processCount(), + FMpi::GetType(FSize()), comm.getComm()), 
__LINE__ ); + + //////////////////////////////////////////////////////// + + nbParticlesSentToOthers.reset(new FSize[comm.processCount()]); + FSize checkerSent = 0; + offsetNbParticlesSentToOthers.reset(new FSize[comm.processCount()+1]); + offsetNbParticlesSentToOthers[0] = 0; + + for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){ + nbParticlesSentToOthers[idxProc] = nbParticlesRecvFromOthersAllAll[comm.processCount()*idxProc + comm.processId()]; + checkerSent += nbParticlesSentToOthers[idxProc]; + offsetNbParticlesSentToOthers[idxProc+1] = offsetNbParticlesSentToOthers[idxProc] + + nbParticlesSentToOthers[idxProc]; + } + // I must have send what I was owning at the beginning + FAssertLF(checkerSent == nbParticlesInitial); + + //////////////////////////////////////////////////////// + + std::unique_ptr<FSize[]> localIdsRecvOrdered(new FSize[nbParticlesWorking]); + + // We list the local id in order of the different proc + for(FSize idxPart = 0 ; idxPart < finalParticles.getSize() ; ++idxPart){ + const int procOwner = finalParticles[idxPart].initialProcOwner; + localIdsRecvOrdered[idxPart] = finalParticles[idxPart].localIndex; + FAssertLF(offsetNbParticlesRecvFromOthers[procOwner] <= idxPart + && idxPart < offsetNbParticlesRecvFromOthers[procOwner+1]); + } + + //////////////////////////////////////////////////////// + + std::unique_ptr<FSize[]> localIdsSendOrdered(new FSize[nbParticlesInitial]); + std::unique_ptr<MPI_Request[]> requests(new MPI_Request[comm.processCount()*2]); + + int iterRequest = 0; + for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){ + if(idxProc == comm.processId()){ + FAssertLF(nbParticlesRecvFromOthers[idxProc] == nbParticlesSentToOthers[idxProc]); + memcpy(&localIdsSendOrdered[offsetNbParticlesSentToOthers[idxProc]], + &localIdsRecvOrdered[offsetNbParticlesRecvFromOthers[idxProc]], + sizeof(FSize)*nbParticlesRecvFromOthers[idxProc]); + } + else{ + const FSize nbRecvFromProc = nbParticlesRecvFromOthers[idxProc]; + if(nbRecvFromProc){ + FMpi::MpiAssert( MPI_Isend(&localIdsRecvOrdered[offsetNbParticlesRecvFromOthers[idxProc]], + int(nbRecvFromProc), + FMpi::GetType(FSize()), idxProc, + 99, comm.getComm(), &requests[iterRequest++]), __LINE__ ); + } + const FSize nbSendToProc = nbParticlesSentToOthers[idxProc]; + if(nbSendToProc){ + FMpi::MpiAssert( MPI_Irecv(&localIdsSendOrdered[offsetNbParticlesSentToOthers[idxProc]], + int(nbSendToProc), + FMpi::GetType(FSize()), idxProc, + 99, comm.getComm(), &requests[iterRequest++]), __LINE__ ); + } + } + } + + FMpi::MpiAssert( MPI_Waitall( iterRequest, requests.get(), MPI_STATUSES_IGNORE), __LINE__ ); + + //////////////////////////////////////////////////////// + + mappingToOthers.reset(new FSize[nbParticlesInitial]); + for(FSize idxPart = 0; idxPart < nbParticlesInitial ; ++idxPart){ + mappingToOthers[localIdsSendOrdered[idxPart]] = idxPart; + } + + return std::move(finalParticles); + } + + //////////////////////////////////////////////////////// + + // physicalValues must be of size nbParticlesInitial + template< int NbPhysicalValuesPerPart> + std::unique_ptr<std::array<FReal, NbPhysicalValuesPerPart>[]> distributeData( + const std::array<FReal, NbPhysicalValuesPerPart> physicalValues[]){ + std::unique_ptr<std::array<FReal, NbPhysicalValuesPerPart>[]> physicalValuesRorder( + new std::array<FReal, NbPhysicalValuesPerPart>[nbParticlesInitial]); + + for(FSize idxPart = 0; idxPart < nbParticlesInitial ; ++idxPart){ + physicalValuesRorder[mappingToOthers[idxPart]] = physicalValues[idxPart]; + } + + // 
Allocate the array to store the physical values of my working interval + std::unique_ptr<std::array<FReal, NbPhysicalValuesPerPart>[]> recvPhysicalValues(new std::array<FReal, NbPhysicalValuesPerPart>[nbParticlesWorking]); + + std::unique_ptr<MPI_Request[]> requests(new MPI_Request[comm.processCount()*2]); + int iterRequest = 0; + for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){ + if(idxProc == comm.processId()){ + FAssertLF(nbParticlesRecvFromOthers[idxProc] == nbParticlesSentToOthers[idxProc]); + memcpy(&recvPhysicalValues[offsetNbParticlesRecvFromOthers[idxProc]], + &physicalValuesRorder[offsetNbParticlesSentToOthers[idxProc]], + sizeof(std::array<FReal, NbPhysicalValuesPerPart>)*nbParticlesRecvFromOthers[idxProc]); + } + else{ + const FSize nbSendToProc = nbParticlesSentToOthers[idxProc]; + if(nbSendToProc){ + FMpi::MpiAssert( MPI_Isend( + const_cast<std::array<FReal, NbPhysicalValuesPerPart>*>(&physicalValuesRorder[offsetNbParticlesSentToOthers[idxProc]]), + int(nbSendToProc*sizeof(std::array<FReal, NbPhysicalValuesPerPart>)), + MPI_BYTE, idxProc, + 2222, comm.getComm(), &requests[iterRequest++]), __LINE__ );; + } + const FSize nbRecvFromProc = nbParticlesRecvFromOthers[idxProc]; + if(nbRecvFromProc){ + FMpi::MpiAssert( MPI_Irecv( + (void*)&recvPhysicalValues[offsetNbParticlesRecvFromOthers[idxProc]], + int(nbRecvFromProc*sizeof(std::array<FReal, NbPhysicalValuesPerPart>)), + MPI_INT, idxProc, + 2222, comm.getComm(), &requests[iterRequest++]), __LINE__ );; + } + } + } + + FMpi::MpiAssert( MPI_Waitall( iterRequest, requests.get(), MPI_STATUSES_IGNORE), __LINE__ ); + + return std::move(recvPhysicalValues); + } + + //////////////////////////////////////////////////////// + + // resValues must be of size nbParticlesWorking + template< int NbResValuesPerPart> + std::unique_ptr<std::array<FReal, NbResValuesPerPart>[]> getResultingData( + const std::array<FReal, NbResValuesPerPart> resValues[]){ + // First allocate the array to store the result + std::unique_ptr<std::array<FReal, NbResValuesPerPart>[]> recvPhysicalValues( + new std::array<FReal, NbResValuesPerPart>[nbParticlesInitial]); + + std::unique_ptr<MPI_Request[]> requests(new MPI_Request[comm.processCount()*2]); + int iterRequest = 0; + for(int idxProc = 0 ; idxProc < comm.processCount() ; ++idxProc){ + if(idxProc == comm.processId()){ + FAssertLF(nbParticlesRecvFromOthers[idxProc] == nbParticlesSentToOthers[idxProc]); + memcpy(&recvPhysicalValues[offsetNbParticlesSentToOthers[idxProc]], + &resValues[offsetNbParticlesRecvFromOthers[idxProc]], + sizeof(std::array<FReal, NbResValuesPerPart>)*nbParticlesRecvFromOthers[idxProc]); + } + else{ + // I originaly receive nbRecvFromProc, so I should + // send nbRecvFromProc back to the real owner + const FSize nbRecvFromProc = nbParticlesRecvFromOthers[idxProc]; + if(nbRecvFromProc){ + FMpi::MpiAssert( MPI_Isend( + const_cast<std::array<FReal, NbResValuesPerPart>*>(&resValues[offsetNbParticlesRecvFromOthers[idxProc]]), + int(nbRecvFromProc*sizeof(std::array<FReal, NbResValuesPerPart>)), + MPI_BYTE, idxProc, + 1111, comm.getComm(), &requests[iterRequest++]), __LINE__ );; + } + // I sent nbSendToProc to idxProc, + // so I should receive nbSendToProc in my interval + const FSize nbSendToProc = nbParticlesSentToOthers[idxProc]; + if(nbSendToProc){ + FMpi::MpiAssert( MPI_Irecv( + &recvPhysicalValues[offsetNbParticlesSentToOthers[idxProc]], + int(nbSendToProc*sizeof(std::array<FReal, NbResValuesPerPart>)), + MPI_BYTE, idxProc, + 1111, comm.getComm(), &requests[iterRequest++]), 
__LINE__ );; + } + } + } + + FMpi::MpiAssert( MPI_Waitall( iterRequest, requests.get(), MPI_STATUSES_IGNORE), __LINE__ ); + + + std::unique_ptr<std::array<FReal, NbResValuesPerPart>[]> recvPhysicalValuesOrder( + new std::array<FReal, NbResValuesPerPart>[nbParticlesInitial]); + + for(FSize idxPart = 0; idxPart < nbParticlesInitial ; ++idxPart){ + recvPhysicalValuesOrder[idxPart] = recvPhysicalValues[mappingToOthers[idxPart]]; + } + + return std::move(recvPhysicalValuesOrder); + } + + //////////////////////////////////////////////////////// + + FSize getNbParticlesWorking() const{ + return nbParticlesWorking; + } + + FSize getMappingResultToLocal(const FSize inIdx) const{ + return mappingToOthers[inIdx]; + } + +}; + +#endif diff --git a/Src/Containers/FOctree.hpp b/Src/Containers/FOctree.hpp index 54cf81058543e83f2681d180ba420ebeb62e0d4b..832f7a782896717983ddb7b4662ebd71fbea9600 100644 --- a/Src/Containers/FOctree.hpp +++ b/Src/Containers/FOctree.hpp @@ -1685,6 +1685,10 @@ public: * @param function */ void forEachLeaf(std::function<void(LeafClass*)> function){ + if(isEmpty()){ + return; + } + Iterator octreeIterator(this); octreeIterator.gotoBottomLeft(); @@ -1698,6 +1702,10 @@ public: * @param function */ void forEachCell(std::function<void(CellClass*)> function){ + if(isEmpty()){ + return; + } + Iterator octreeIterator(this); octreeIterator.gotoBottomLeft(); @@ -1717,6 +1725,10 @@ public: * @param function */ void forEachCellWithLevel(std::function<void(CellClass*,const int)> function){ + if(isEmpty()){ + return; + } + Iterator octreeIterator(this); octreeIterator.gotoBottomLeft(); @@ -1736,6 +1748,10 @@ public: * @param function */ void forEachCellLeaf(std::function<void(CellClass*,LeafClass*)> function){ + if(isEmpty()){ + return; + } + Iterator octreeIterator(this); octreeIterator.gotoBottomLeft(); diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp index 34812c09bc31aefc72fc101f1942433ccfb3ad96..86fe46bb186b200166c4abb47c09f99225e34179 100644 --- a/Src/Core/FFmmAlgorithmThreadProc.hpp +++ b/Src/Core/FFmmAlgorithmThreadProc.hpp @@ -72,17 +72,20 @@ private: OctreeClass* const tree; ///< The octree to work on KernelClass** kernels; ///< The kernels - const FMpi::FComm& comm; ///< MPI comm + const FMpi::FComm comm; ///< MPI comm + FMpi::FComm fcomCompute; /// Used to store pointers to cells/leafs to work with - typename OctreeClass::Iterator* iterArray; + typename OctreeClass::Iterator* iterArray; /// Used to store pointers to cells/leafs to send/rcv typename OctreeClass::Iterator* iterArrayComm; int numberOfLeafs; ///< To store the size at the previous level const int MaxThreads; ///< Max number of thread allowed by openmp - const int nbProcess; ///< Process count - const int idProcess; ///< Current process id + const int nbProcessOrig; ///< Process count + const int idProcessOrig; ///< Current process id + int nbProcess; ///< Process count + int idProcess; ///< Current process id const int OctreeHeight; ///< Tree height const int userChunkSize; @@ -149,12 +152,15 @@ public: tree(inTree), kernels(nullptr), comm(inComm), + fcomCompute(inComm), iterArray(nullptr), iterArrayComm(nullptr), numberOfLeafs(0), MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())), - nbProcess(inComm.processCount()), - idProcess(inComm.processId()), + nbProcessOrig(inComm.processCount()), + idProcessOrig(inComm.processId()), + nbProcess(0), + idProcess(0), OctreeHeight(tree->getHeight()), userChunkSize(inUserChunkSize), 
leafLevelSeparationCriteria(inLeafLevelSeperationCriteria), @@ -164,9 +170,9 @@ public: FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3"); this->kernels = new KernelClass*[MaxThreads]; - #pragma omp parallel num_threads(MaxThreads) +#pragma omp parallel num_threads(MaxThreads) { - #pragma omp critical (InitFFmmAlgorithmThreadProc) +#pragma omp critical (InitFFmmAlgorithmThreadProc) { this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels); } @@ -175,7 +181,7 @@ public: FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight()); FLOG(FLog::Controller << "FFmmAlgorithmThreadProc\n"); - FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n"); + FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcessOrig << ", I am " << idProcessOrig << ".\n"); FLOG(FLog::Controller << "Chunck Size = " << userChunkSize << "\n"); } @@ -196,136 +202,158 @@ protected: * Call this function to run the complete algorithm */ void executeCore(const unsigned operationsToProceed) override { - // Count leaf + // We are not involve if the tree is empty + const int iHaveParticles = (!tree->isEmpty()); + + std::unique_ptr<int[]> hasParticles(new int[comm.processCount()]); + FMpi::Assert( MPI_Allgather(const_cast<int*>(&iHaveParticles), 1,MPI_INT, + hasParticles.get(), 1, MPI_INT, + comm.getComm()), __LINE__); + + fcomCompute = FMpi::FComm(comm); + fcomCompute.groupReduce(hasParticles.get()); + + if(iHaveParticles){ + + nbProcess = fcomCompute.processCount(); + idProcess = fcomCompute.processId(); + + FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n"); + + // Count leaf #ifdef SCALFMM_TRACE_ALGO - eztrace_resume(); + eztrace_resume(); #endif - this->numberOfLeafs = 0; - { - Interval myFullInterval; - {//Building the interval with the first and last leaves (and count the number of leaves) - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex(); - do{ - ++this->numberOfLeafs; - } while(octreeIterator.moveRight()); - myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex(); - } - // Allocate a number to store the pointer of the cells at a level - iterArray = new typename OctreeClass::Iterator[numberOfLeafs]; - iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs]; - FAssertLF(iterArray, "iterArray bad alloc"); - FAssertLF(iterArrayComm, "iterArrayComm bad alloc"); - - // We get the leftIndex/rightIndex indexes from each procs - FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()), __LINE__ ); - - // Build my intervals for all levels - std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]); - // At leaf level we know it is the full interval - myIntervals[OctreeHeight - 1] = myFullInterval; - - // We can estimate the interval for each level by using the parent/child relation - for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){ - myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3; - myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3; - } + this->numberOfLeafs = 0; + { + Interval myFullInterval; + {//Building the interval with the first and last leaves (and count the number of leaves) + typename OctreeClass::Iterator octreeIterator(tree); + 
octreeIterator.gotoBottomLeft(); + myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex(); + do{ + ++this->numberOfLeafs; + } while(octreeIterator.moveRight()); + myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex(); + } + // Allocate a number to store the pointer of the cells at a level + iterArray = new typename OctreeClass::Iterator[numberOfLeafs]; + iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs]; + FAssertLF(iterArray, "iterArray bad alloc"); + FAssertLF(iterArrayComm, "iterArrayComm bad alloc"); + + // We get the leftIndex/rightIndex indexes from each procs + FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, fcomCompute.getComm()), __LINE__ ); + + // Build my intervals for all levels + std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]); + // At leaf level we know it is the full interval + myIntervals[OctreeHeight - 1] = myFullInterval; + + // We can estimate the interval for each level by using the parent/child relation + for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){ + myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3; + myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3; + } - // Process 0 uses the estimates as real intervals, but other processes - // should remove cells that belong to others - if(idProcess != 0){ - //We test for each level if process on left (idProcess-1) own cell I thought I owned - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - octreeIterator.moveUp(); + // Process 0 uses the estimates as real intervals, but other processes + // should remove cells that belong to others + if(idProcess != 0){ + //We test for each level if process on left (idProcess-1) own cell I thought I owned + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); + octreeIterator.moveUp(); - // At h-1 the working limit is the parent of the right cell of the proc on the left - MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3; + // At h-1 the working limit is the parent of the right cell of the proc on the left + MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3; - // We check if we have no more work to do - int nullIntervalFromLevel = 0; + // We check if we have no more work to do + int nullIntervalFromLevel = 0; - for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){ - while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){ - if( !octreeIterator.moveRight() ){ - // We cannot move right we are not owner of any more cell - nullIntervalFromLevel = idxLevel; - break; + for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){ + while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){ + if( !octreeIterator.moveRight() ){ + // We cannot move right we are not owner of any more cell + nullIntervalFromLevel = idxLevel; + break; + } + } + // If we are responsible for some cells at this level keep the first index + if(nullIntervalFromLevel == 0){ + myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex(); + octreeIterator.moveUp(); + workingLimitAtLevel >>= 3; } } - // If we are responsible for some cells at this level keep the first index - if(nullIntervalFromLevel == 0){ - myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex(); - 
octreeIterator.moveUp(); - workingLimitAtLevel >>= 3; + // In case we are not responsible for any cells we put the leftIndex = rightIndex+1 + for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){ + myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1; } } - // In case we are not responsible for any cells we put the leftIndex = rightIndex+1 - for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){ - myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1; - } - } - // We get the leftIndex/rightIndex indexes from each procs - FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, - workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()), __LINE__ ); - } + // We get the leftIndex/rightIndex indexes from each procs + FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, + workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, fcomCompute.getComm()), __LINE__ ); + } #ifdef SCALFMM_TRACE_ALGO - eztrace_enter_event("P2M", EZTRACE_YELLOW); + eztrace_enter_event("P2M", EZTRACE_YELLOW); #endif - Timers[P2MTimer].tic(); - if(operationsToProceed & FFmmP2M) bottomPass(); - Timers[P2MTimer].tac(); + Timers[P2MTimer].tic(); + if(operationsToProceed & FFmmP2M) bottomPass(); + Timers[P2MTimer].tac(); #ifdef SSCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("M2M", EZTRACE_PINK); + eztrace_leave_event(); + eztrace_enter_event("M2M", EZTRACE_PINK); #endif - Timers[M2MTimer].tic(); - if(operationsToProceed & FFmmM2M) upwardPass(); - Timers[M2MTimer].tac(); + Timers[M2MTimer].tic(); + if(operationsToProceed & FFmmM2M) upwardPass(); + Timers[M2MTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("M2L", EZTRACE_GREEN); + eztrace_leave_event(); + eztrace_enter_event("M2L", EZTRACE_GREEN); #endif - Timers[M2LTimer].tic(); - if(operationsToProceed & FFmmM2L) transferPass(); - Timers[M2LTimer].tac(); + Timers[M2LTimer].tic(); + if(operationsToProceed & FFmmM2L) transferPass(); + Timers[M2LTimer].tac(); - #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("L2L", EZTRACE_PINK); +#ifdef SCALFMM_TRACE_ALGO + eztrace_leave_event(); + eztrace_enter_event("L2L", EZTRACE_PINK); #endif - Timers[L2LTimer].tic(); - if(operationsToProceed & FFmmL2L) downardPass(); - Timers[L2LTimer].tac(); + Timers[L2LTimer].tic(); + if(operationsToProceed & FFmmL2L) downardPass(); + Timers[L2LTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("L2P+P2P", EZTRACE_BLUE); + eztrace_leave_event(); + eztrace_enter_event("L2P+P2P", EZTRACE_BLUE); #endif - Timers[NearTimer].tic(); - if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P)); - Timers[NearTimer].tac(); + Timers[NearTimer].tic(); + if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P)); + Timers[NearTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_stop(); + eztrace_leave_event(); + eztrace_stop(); #endif - // delete array - delete [] iterArray; - delete [] iterArrayComm; - iterArray = nullptr; - iterArrayComm = nullptr; + // delete array + delete [] iterArray; + delete [] iterArrayComm; + iterArray = nullptr; + iterArrayComm = nullptr; #ifdef SCALFMM_TRACE_ALGO - 
eztrace_stop(); + eztrace_stop(); #endif + } + else{ + FLOG( FLog::Controller << "\tProcess = " << comm.processId() << " has zero particles.\n" ); + } } ///////////////////////////////////////////////////////////////////////////// @@ -455,11 +483,11 @@ protected: } FLOG(parallelCounter.tic()); - #pragma omp parallel num_threads(MaxThreads) +#pragma omp parallel num_threads(MaxThreads) { KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]); //This single section post and receive the comms, and then do the M2M associated with it. - #pragma omp single nowait +#pragma omp single nowait { FLOG(singleCounter.tic()); // Master proc never send @@ -491,10 +519,10 @@ protected: // Post the message bufferSize = sendBuffer.getSize(); MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo, - FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]); + FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo, - FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]); + FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); } } @@ -509,7 +537,7 @@ protected: && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){ if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]), - idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]); + idProcSource, FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); nbProcThatSendToMe += 1; FAssertLF(nbProcThatSendToMe <= 7); } @@ -535,7 +563,7 @@ protected: recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]); FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max()); MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE, - idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]); + idProcSource, FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); nbProcThatSendToMe += 1; FAssertLF(nbProcThatSendToMe <= 7); } @@ -557,11 +585,11 @@ protected: memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*)); // Retreive data and merge my child and the child from others + int positionToInsert = 0; for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){ unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>()); int position = 0; - int positionToInsert = 0; while( packageFlags && position < 8){ while(!(packageFlags & 0x1)){ packageFlags >>= 1; @@ -590,7 +618,7 @@ protected: }//End Of Single section // All threads proceed the M2M - #pragma omp for nowait schedule(dynamic, userChunkSize) +#pragma omp for nowait schedule(dynamic, userChunkSize) for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){ myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); } @@ -645,9 +673,9 @@ protected: FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight]; memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * 
nbProcess * OctreeHeight); - #pragma omp parallel num_threads(MaxThreads) +#pragma omp parallel num_threads(MaxThreads) { - #pragma omp master +#pragma omp master { { FLOG(prepareCounter.tic()); @@ -747,7 +775,7 @@ protected: ////////////////////////////////////////////////////////////////// FLOG(gatherCounter.tic()); - FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()), __LINE__ ); + FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, fcomCompute.getComm()), __LINE__ ); FLOG(gatherCounter.tac()); ////////////////////////////////////////////////////////////////// @@ -780,7 +808,7 @@ protected: FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(), sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc, - FMpi::TagLast + idxLevel*100, comm, &requests); + FMpi::TagLast + idxLevel*100, fcomCompute, &requests); } const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; @@ -789,7 +817,7 @@ protected: FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(), recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc, - FMpi::TagLast + idxLevel*100, comm, &requests); + FMpi::TagLast + idxLevel*100, fcomCompute, &requests); } } } @@ -808,7 +836,7 @@ protected: // Do M2L ////////////////////////////////////////////////////////////////// - #pragma omp single nowait +#pragma omp single nowait { typename OctreeClass::Iterator octreeIterator(tree); octreeIterator.moveDown(); @@ -845,7 +873,7 @@ protected: { const int chunckSize = userChunkSize; for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){ - #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize) +#pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize) { KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; const CellClass* neighbors[342]; @@ -860,15 +888,15 @@ protected: } }//End of task spawning - #pragma omp taskwait +#pragma omp taskwait for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){ - #pragma omp task default(none) firstprivate(idxThread,idxLevel) +#pragma omp task default(none) firstprivate(idxThread,idxLevel) { kernels[idxThread]->finishedLevelM2L(idxLevel); } } - #pragma omp taskwait +#pragma omp taskwait FLOG(computationCounter.tac()); } @@ -945,7 +973,7 @@ protected: // Compute this cells FLOG(computationCounter.tic()); - #pragma omp parallel num_threads(MaxThreads) +#pragma omp parallel num_threads(MaxThreads) { KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; MortonIndex neighborsIndex[/*189+26+1*/216]; @@ -953,7 +981,7 @@ protected: const CellClass* neighbors[342]; int neighborPositions[342]; - #pragma omp for schedule(dynamic, userChunkSize) nowait +#pragma omp for schedule(dynamic, userChunkSize) nowait for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ // compute indexes const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition, separationCriteria); @@ -1079,10 +1107,10 @@ protected: } } - #pragma omp parallel num_threads(MaxThreads) +#pragma omp parallel num_threads(MaxThreads) { KernelClass* myThreadkernels = 
(kernels[omp_get_thread_num()]); - #pragma omp single nowait +#pragma omp single nowait { FLOG(prepareCounter.tic()); int iterRequests = 0; @@ -1092,7 +1120,7 @@ protected: // Post the receive if(hasToReceive){ FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive, - FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ ); + FMpi::TagFmmL2LSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ ); } // We have to be sure that we are not sending if we have no work in the current level @@ -1112,10 +1140,10 @@ protected: } // Post the send message FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend, - FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__); + FMpi::TagFmmL2LSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__); FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, idxProcSend, - FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__); + FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__); // Inc and check the counter nbMessageSent += 1; FAssertLF(nbMessageSent <= 7); @@ -1129,7 +1157,7 @@ protected: // Finalize the communication if(iterRequestsSize){ FLOG(waitCounter.tic()); - FAssertLF(iterRequestsSize < 8); + FAssertLF(iterRequestsSize <= 8); FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, MPI_STATUSES_IGNORE), __LINE__); FLOG(waitCounter.tac()); } @@ -1138,12 +1166,12 @@ protected: recvBuffer.cleanAndResize(recvBufferSize); FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max()); FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_BYTE, idxProcToReceive, - FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ ); + FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__ ); } if(iterRequests){ FLOG(waitCounter.tic()); - FAssertLF(iterRequests < 8); + FAssertLF(iterRequests <= 8); FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE), __LINE__); FLOG(waitCounter.tac()); } @@ -1159,12 +1187,12 @@ protected: FLOG(prepareCounter.tac()); } - #pragma omp single nowait +#pragma omp single nowait { FLOG(computationCounter.tic()); } // Threads are working on all the cell of our working interval at that level - #pragma omp for nowait schedule(dynamic, userChunkSize) +#pragma omp for nowait schedule(dynamic, userChunkSize) for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){ myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); } @@ -1310,7 +1338,7 @@ protected: //Share to all processus globalReceiveMap FLOG(gatherCounter.tic()); FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend), - globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), comm.getComm()), __LINE__ ); + globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), fcomCompute.getComm()), __LINE__ ); FLOG(gatherCounter.tac()); FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess]; @@ -1329,7 +1357,7 @@ protected: recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]); FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), - 
idxProc, FMpi::TagFmmP2P, comm, &requests); + idxProc, FMpi::TagFmmP2P, fcomCompute, &requests); } } @@ -1347,7 +1375,7 @@ protected: FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]); FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(), - idxProc, FMpi::TagFmmP2P, comm, &requests); + idxProc, FMpi::TagFmmP2P, fcomCompute, &requests); } } @@ -1380,7 +1408,7 @@ protected: delete recvBuffer[idxProc]; recvBuffer[idxProc] = nullptr; } - } + } for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ delete sendBuffer[idxProc]; @@ -1472,7 +1500,7 @@ protected: // need the current particles and neighbors particles const int counter = tree->getLeafsNeighbors(neighbors, neighborPositions, currentIter.coord, OctreeHeight-1); myThreadkernels->P2P( currentIter.coord,currentIter.targets, - currentIter.sources, neighbors, neighborPositions, counter); + currentIter.sources, neighbors, neighborPositions, counter); } } } @@ -1528,7 +1556,7 @@ protected: } if(counter){ myThreadkernels.P2PRemote( currentIter.cell->getCoordinate(), currentIter.targets, - currentIter.sources, neighbors, neighborPositions, counter); + currentIter.sources, neighbors, neighborPositions, counter); } } diff --git a/Src/Files/FMpiTreeBuilder.hpp b/Src/Files/FMpiTreeBuilder.hpp index 95ed92bb00982bc3c6cd1ba978b78290c9aceb6e..fc449ea4038ca5203eb773ae545240b20fb42dcd 100644 --- a/Src/Files/FMpiTreeBuilder.hpp +++ b/Src/Files/FMpiTreeBuilder.hpp @@ -162,7 +162,7 @@ public: // To merge the leaves ////////////////////////////////////////////////////////////////////////// - static void MergeSplitedLeaves(const FMpi::FComm& communicator, IndexedParticle* workingArray, FSize* workingSize, + static void MergeSplitedLeaves(const FMpi::FComm& communicator, IndexedParticle** workingArray, FSize* workingSize, FSize ** leavesOffsetInParticles, ParticleClass** particlesArrayInLeafOrder, FSize* const leavesSize){ const int myRank = communicator.processId(); const int nbProcs = communicator.processCount(); @@ -171,13 +171,13 @@ public: { // Get the information of the leaves leavesInfo.clear(); if((*workingSize)){ - leavesInfo.push({workingArray[0].index, 1, 0}); + leavesInfo.push({(*workingArray)[0].index, 1, 0}); for(FSize idxPart = 1 ; idxPart < (*workingSize) ; ++idxPart){ - if(leavesInfo.data()[leavesInfo.getSize()-1].mindex == workingArray[idxPart].index){ + if(leavesInfo.data()[leavesInfo.getSize()-1].mindex == (*workingArray)[idxPart].index){ leavesInfo.data()[leavesInfo.getSize()-1].nbParts += 1; } else{ - leavesInfo.push({workingArray[idxPart].index, 1, idxPart}); + leavesInfo.push({(*workingArray)[idxPart].index, 1, idxPart}); } } } @@ -209,12 +209,16 @@ public: while(0 < idProcToSendTo && (allProcFirstLeafStates[(idProcToSendTo-1)*2 + 1].mindex == borderLeavesState[0].mindex || allProcFirstLeafStates[(idProcToSendTo-1)*2 + 1].mindex == noDataFlag)){ + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] idProcToSendTo " + << idProcToSendTo << " allProcFirstLeafStates[(idProcToSendTo-1)*2 + 1].mindex " << + allProcFirstLeafStates[(idProcToSendTo-1)*2 + 1].mindex << " borderLeavesState[0].mindex " << + borderLeavesState[0].mindex << "\n"; FLog::Controller.flush(); ); idProcToSendTo -= 1; } // We found someone if(idProcToSendTo != myRank && allProcFirstLeafStates[(idProcToSendTo)*2 + 1].mindex == borderLeavesState[0].mindex){ // Post and send message for the first leaf - FMpi::ISendSplit(&workingArray[0], borderLeavesState[0].nbParts, 
idProcToSendTo, + FMpi::ISendSplit(&(*workingArray)[0], borderLeavesState[0].nbParts, idProcToSendTo, FMpi::TagExchangeIndexs, communicator, &requests); FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] send " << borderLeavesState[0].nbParts << " to " << idProcToSendTo << "\n"; FLog::Controller.flush(); ); hasSentFirstLeaf = true; @@ -228,11 +232,18 @@ public: // Count all the particle of our first leaf on other procs FSize totalNbParticlesToRecv = 0; int idProcToRecvFrom = myRank; - while(idProcToRecvFrom+1 < nbProcs && - (borderLeavesState[1].mindex == allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex - || allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex == noDataFlag)){ - idProcToRecvFrom += 1; - totalNbParticlesToRecv += allProcFirstLeafStates[(idProcToRecvFrom)*2].nbParts; + if(!hasSentFirstLeaf || borderLeavesState[0].mindex != borderLeavesState[1].mindex){ + while(idProcToRecvFrom+1 < nbProcs && + (borderLeavesState[1].mindex == allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex + || allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex == noDataFlag)){ + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] idProcToRecvFrom " + << idProcToRecvFrom << " allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex " << + allProcFirstLeafStates[(idProcToRecvFrom+1)*2].mindex << " borderLeavesState[1].mindex " << + borderLeavesState[1].mindex << "\n"; FLog::Controller.flush(); ); + + idProcToRecvFrom += 1; + totalNbParticlesToRecv += allProcFirstLeafStates[(idProcToRecvFrom)*2].nbParts; + } } // If there are some if(totalNbParticlesToRecv){ @@ -262,7 +273,7 @@ public: const FSize offsetParticles = borderLeavesState[0].nbParts; // Move all the particles for(FSize idxPart = offsetParticles ; idxPart < (*workingSize) ; ++idxPart){ - workingArray[idxPart - offsetParticles] = workingArray[idxPart]; + (*workingArray)[idxPart - offsetParticles] = (*workingArray)[idxPart]; } // Move all the leaf for(int idxLeaf = 1 ; idxLeaf < leavesInfo.getSize() ; ++idxLeaf){ @@ -276,14 +287,16 @@ public: if(hasExtendLastLeaf){ // Allocate array const FSize finalParticlesNumber = (*workingSize) + receivedParticles.size(); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Create array " + << finalParticlesNumber << " particles\n"; FLog::Controller.flush(); ); IndexedParticle* particlesWithExtension = new IndexedParticle[finalParticlesNumber]; // Copy old data - memcpy(particlesWithExtension, workingArray, (*workingSize)*sizeof(IndexedParticle)); + memcpy(particlesWithExtension, (*workingArray), (*workingSize)*sizeof(IndexedParticle)); // Copy received data memcpy(particlesWithExtension + (*workingSize), receivedParticles.data(), receivedParticles.size()*sizeof(IndexedParticle)); // Move ptr - delete[] workingArray; - workingArray = particlesWithExtension; + delete[] (*workingArray); + (*workingArray) = particlesWithExtension; (*workingSize) = finalParticlesNumber; leavesInfo[leavesInfo.getSize()-1].nbParts += receivedParticles.size(); } @@ -298,7 +311,7 @@ public: //Copy all the particles (*particlesArrayInLeafOrder) = new ParticleClass[(*workingSize)]; for(FSize idxPart = 0 ; idxPart < (*workingSize) ; ++idxPart){ - memcpy(&(*particlesArrayInLeafOrder)[idxPart],&workingArray[idxPart].particle,sizeof(ParticleClass)); + memcpy(&(*particlesArrayInLeafOrder)[idxPart],&(*workingArray)[idxPart].particle,sizeof(ParticleClass)); } // Assign the number of leaf (*leavesSize) = 
leavesInfo.getSize(); @@ -363,8 +376,9 @@ public: const std::vector<FEqualize::Package> packsToSend = FEqualize::GetPackToSend(myCurrentInter, allObjectives); FAssertLF((currentNbLeaves == 0 && packsToSend.size() == 0) || - (packsToSend.size() && FSize(packsToSend[packsToSend.size()-1].elementTo) == currentNbLeaves)); + (currentNbLeaves != 0 && packsToSend.size() && FSize(packsToSend[packsToSend.size()-1].elementTo) == currentNbLeaves)); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Previous currentNbLeaves (" << currentNbLeaves << ")\n"; FLog::Controller.flush(); ); FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Get my interval (" << packsToSend.size() << ")\n"; FLog::Controller.flush(); ); FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << communicator.processId() << "] Send data\n"; FLog::Controller.flush(); ); @@ -379,6 +393,8 @@ public: const FEqualize::Package& pack = packsToSend[idxPack]; if(idxPack != 0) FAssertLF(packsToSend[idxPack].elementFrom == packsToSend[idxPack-1].elementTo); + FAssertLF(FSize(pack.elementTo) <= FSize(currentNbParts)); + FAssertLF(pack.elementFrom <= pack.elementTo); const long long int nbPartsPerPackToSend = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom]; totalSend += nbPartsPerPackToSend; @@ -388,7 +404,7 @@ public: << " from " << pack.elementFrom << " to " << pack.elementTo << " \n"; FLog::Controller.flush(); ); // Send the size of the data requestsNbParts.emplace_back(); - FMpi::MpiAssert(MPI_Isend(&nbPartsPerPackToSend,1,MPI_LONG_LONG_INT,pack.idProc, + FMpi::MpiAssert(MPI_Isend(const_cast<long long int*>(&nbPartsPerPackToSend),1,MPI_LONG_LONG_INT,pack.idProc, FMpi::TagExchangeIndexs, communicator.getComm(), &requestsNbParts.back()),__LINE__); } @@ -552,7 +568,9 @@ public: // From ParticleClass get array of IndexedParticle sorted GetSortedParticlesFromArray(communicator, originalParticlesArray, originalNbParticles, sortingType, boxCenter, boxWidth, treeHeight, &sortedParticlesArray, &nbParticlesInArray); - FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); ); + FLOG( FLog::Controller << "[" << communicator.processId() << "] Particles Distribution: " + << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s) " + << nbParticlesInArray << " particles\n"; FLog::Controller.flush(); ); FLOG( timer.tic() ); // for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){ @@ -563,7 +581,7 @@ public: FSize * leavesOffsetInParticles = nullptr; FSize nbLeaves = 0; // Merge the leaves - MergeSplitedLeaves(communicator, sortedParticlesArray, &nbParticlesInArray, &leavesOffsetInParticles, &particlesArrayInLeafOrder, &nbLeaves); + MergeSplitedLeaves(communicator, &sortedParticlesArray, &nbParticlesInArray, &leavesOffsetInParticles, &particlesArrayInLeafOrder, &nbLeaves); delete[] sortedParticlesArray; // for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){ diff --git a/Src/Utils/FMpi.hpp b/Src/Utils/FMpi.hpp index f0654ea8ea223c919ca278774fecc095be29bf9e..c299a0fc6088dc6de6830c41ee35452b1cc5b85f 100644 --- a/Src/Utils/FMpi.hpp +++ b/Src/Utils/FMpi.hpp @@ -107,7 +107,7 @@ public: * This class is used to gather the usual methods related to identifying an * MPI communicator. 
*/ - class FComm : public FNoCopyable { + class FComm { int rank; ///< rank related to the comm int nbProc; ///< nb proc in this group @@ -130,6 +130,26 @@ public: reset(); } + /// Constructor : duplicates the given communicator + FComm(const FComm& inCommunicator ) { + FMpi::Assert( MPI_Comm_dup(inCommunicator.communicator, &communicator), __LINE__ , "comm dup"); + FMpi::Assert( MPI_Comm_group(communicator, &group), __LINE__ , "comm group"); + + reset(); + } + + FComm& operator=(const FComm& inCommunicator ) { + FMpi::Assert( MPI_Comm_free(&communicator), __LINE__ ); + FMpi::Assert( MPI_Group_free(&group), __LINE__ ); + + FMpi::Assert( MPI_Comm_dup(inCommunicator.communicator, &communicator), __LINE__ , "comm dup"); + FMpi::Assert( MPI_Comm_group(communicator, &group), __LINE__ , "comm group"); + + reset(); + + return *this; + } + /// Frees communicator and group virtual ~FComm(){ FMpi::Assert( MPI_Comm_free(&communicator), __LINE__ ); @@ -248,6 +268,35 @@ public: delete[] procsIdArray ; } + /** Change the group, create one groupd where processInGroup[i] != 0 + * and another where processInGroup[i] == 0 + */ + void groupReduce(const int processInGroup[]){ + int * procsIdArray = new int [nbProc]; + int counterNewGroup = 0; + for(int idxProc = 0 ;idxProc < nbProc ; ++idxProc){ + if(processInGroup[rank] && processInGroup[idxProc]){ + procsIdArray[counterNewGroup++] = idxProc; + } + else if(!processInGroup[rank] && !processInGroup[idxProc]){ + procsIdArray[counterNewGroup++] = idxProc; + } + } + + MPI_Group previousGroup = group; + FMpi::Assert( MPI_Group_incl(previousGroup, counterNewGroup , procsIdArray, &group), __LINE__ ); + + MPI_Comm previousComm = communicator; + FMpi::Assert( MPI_Comm_create(previousComm, group, &communicator), __LINE__ ); + + MPI_Comm_free(&previousComm); + MPI_Group_free(&previousGroup); + + reset(); + FAssertLF(nbProc == counterNewGroup); + delete[] procsIdArray ; + } + void barrier() const { FMpi::Assert(MPI_Barrier(getComm()), __LINE__); } diff --git a/Src/Utils/FQuickSortMpi.hpp b/Src/Utils/FQuickSortMpi.hpp index 437d2ede4e2f81538ffb9ecdef515f88be01b197..eba5fdfff4bc1298bf76ef6e7cf1e7c72d354e36 100644 --- a/Src/Utils/FQuickSortMpi.hpp +++ b/Src/Utils/FQuickSortMpi.hpp @@ -93,14 +93,18 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { { // Get the number of elements each proc should recv IndexType totalRemainingElements = totalElements; + IndexType totalAvailableElements = totalElementToProceed; + for(int idxProc = firstProcToRecv; idxProc < lastProcToRecv ; ++idxProc){ const IndexType nbElementsAlreadyOwned = (inFromRightToLeft ? 
globalElementBalance[idxProc].lowerPart : globalElementBalance[idxProc].greaterPart); const IndexType averageNbElementForRemainingProc = (totalRemainingElements)/(lastProcToRecv-idxProc); totalRemainingElements -= nbElementsAlreadyOwned; FAssertLF(totalRemainingElements >= 0); - if(nbElementsAlreadyOwned < averageNbElementForRemainingProc){ - nbElementsToRecvPerProc[idxProc - firstProcToRecv] = (averageNbElementForRemainingProc - nbElementsAlreadyOwned); + if(nbElementsAlreadyOwned < averageNbElementForRemainingProc && totalAvailableElements){ + nbElementsToRecvPerProc[idxProc - firstProcToRecv] = FMath::Min(totalAvailableElements, + averageNbElementForRemainingProc - nbElementsAlreadyOwned); + totalAvailableElements -= nbElementsToRecvPerProc[idxProc - firstProcToRecv]; totalRemainingElements -= nbElementsToRecvPerProc[idxProc - firstProcToRecv]; } else{ @@ -308,6 +312,10 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> { counterValuesInPivot += 1; } } + if(counterValuesInPivot <= 1){ + (*shouldStop) = true; + return globalPivot; + } (*shouldStop) = false; return globalPivot/counterValuesInPivot; } @@ -336,7 +344,7 @@ public: break; } - FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalPivot = " << globalPivot << "\n" ); + FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] globalPivot = " << globalPivot << " for " << currentComm.processCount() << "\n" ); FLOG(if(VerboseLog) FLog::Controller.flush()); // Split the array in two parts lower equal to pivot and greater than pivot diff --git a/Tests/Utils/testPartitionsMapping.cpp b/Tests/Utils/testPartitionsMapping.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7f5c1bce67ae1b86b38253a3f94caef428d48a4 --- /dev/null +++ b/Tests/Utils/testPartitionsMapping.cpp @@ -0,0 +1,149 @@ +// =================================================================================== +// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas +// olivier.coulaud@inria.fr, berenger.bramas@inria.fr +// This software is a computer program whose purpose is to compute the FMM. +// +// This software is governed by the CeCILL-C and LGPL licenses and +// abiding by the rules of distribution of free software. +// +// This program is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +// GNU General Public and CeCILL-C Licenses for more details. +// "http://www.cecill.info". +// "http://www.gnu.org/licenses". 
+// =================================================================================== + +// ==== CMAKE ===== +// @FUSE_MPI +// ================ + +#include <iostream> + +#include <cstdio> +#include <cstdlib> + + +#include "../../Src/Kernels/Rotation/FRotationCell.hpp" +#include "../../Src/Kernels/Rotation/FRotationKernel.hpp" + +#include "../../Src/Components/FSimpleLeaf.hpp" +#include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp" + +#include "../../Src/Utils/FParameters.hpp" +#include "../../Src/Utils/FMemUtils.hpp" +#include "../../Src/BalanceTree/FPartitionsMapping.hpp" + +#include "../../Src/Containers/FOctree.hpp" +#include "../../Src/Containers/FVector.hpp" + +#include "../../Src/Files/FRandomLoader.hpp" +#include "../../Src/Files/FMpiTreeBuilder.hpp" + +#include "../../Src/Core/FFmmAlgorithm.hpp" +#include "../../Src/Core/FFmmAlgorithmThread.hpp" +#include "../../Src/Core/FFmmAlgorithmThreadProc.hpp" + +#include "../../Src/BalanceTree/FLeafBalance.hpp" + +#include "../../Src/Utils/FParameterNames.hpp" + +/** + * This program runs the FMM Algorithm Distributed with the Rotation kernel + */ + +// Simply create particles and try the kernels +int main(int argc, char* argv[]) +{ + FHelpDescribeAndExit(argc, argv, + "Test with MPI the chebyshev FMM and compare it to the direct computation for debugging purpose.", + FParameterDefinitions::NbParticles, FParameterDefinitions::OctreeHeight, + FParameterDefinitions::OctreeSubHeight, FParameterDefinitions::NbThreads); + + typedef double FReal; + + FMpi app(argc,argv); + + const FSize nbParticles = FParameters::getValue(argc,argv, FParameterDefinitions::NbParticles.options, 10000000ULL); + const unsigned int TreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5); + FTic time; + + std::cout << ">> This executable has to be used to test Proc Rotation Algorithm. 
\n"; + + // init particles position and physical value + struct TestParticle{ + FPoint<FReal> position; + const FPoint<FReal>& getPosition(){ + return position; + } + }; + + // open particle file + std::cout << "Creating : " << nbParticles << "\n" << std::endl; + FRandomLoader<FReal> loader(nbParticles, 1.0, FPoint<FReal>(0,0,0), app.global().processId()); + + time.tic(); + std::unique_ptr<TestParticle[]> particles(new TestParticle[loader.getNumberOfParticles()]); + for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){ + loader.fillParticle(&particles[idxPart].position); + } + + FPartitionsMapping<FReal> map(app.global()); + + FVector<FPartitionsMapping<double>::TestParticle<0> > finalParticles = map.distributeParticles<0>(loader.getNumberOfParticles(), + loader.getCenterOfBox(), + loader.getBoxWidth(), TreeHeight, + [&](const int idx, FPoint<FReal>* position, std::array<FReal, 0>* /*val*/){ + position->setPosition(particles[idx].position.getX(), + particles[idx].position.getY(), + particles[idx].position.getZ()); + }); + + // Test every particles exist + { + std::unique_ptr<int[]> count(new int[app.global().processCount() * nbParticles]); + memset(count.get(), 0, sizeof(int) * app.global().processCount() * nbParticles); + for(FSize part = 0 ; part < finalParticles.getSize() ; ++part){ + const FSize idx = finalParticles[part].initialProcOwner*nbParticles + finalParticles[part].localIndex; + FAssertLF(count[idx] == 0) + count[idx] = 1; + } + + FMpi::Assert( MPI_Allreduce(MPI_IN_PLACE, count.get(), app.global().processCount()*nbParticles, MPI_INT, MPI_SUM, + app.global().getComm()), __LINE__); + + for(FSize part = 0 ; part < app.global().processCount()*nbParticles ; ++part){ + FAssertLF(count[part] == 1); + } + } + + // Test to send data + { + std::unique_ptr<std::array<FReal, 2>[]> toDistr(new std::array<FReal, 2>[nbParticles]); + for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){ + toDistr[idxPart][0] = app.global().processId(); + toDistr[idxPart][1] = FReal(idxPart); + } + + std::unique_ptr<std::array<FReal, 2>[]> res = map.distributeData<2>(toDistr.get()); + + for(FSize part = 0 ; part < finalParticles.getSize() ; ++part){ + if(int(res[part][0]) != finalParticles[part].initialProcOwner) + std::cout << "[" << app.global().processId() << "]Res proc is " << res[part][0] << " should be " << finalParticles[part].initialProcOwner << std::endl; + if(int(res[part][1]) != finalParticles[part].localIndex) + std::cout << "[" << app.global().processId() << "]Res localidx is " << res[part][1] << " should be " << finalParticles[part].localIndex << std::endl; + } + + std::unique_ptr<std::array<FReal, 2>[]> resBack = map.getResultingData<2>(res.get()); + + for(FSize part = 0 ; part < loader.getNumberOfParticles() ; ++part){ + if(int(resBack[part][0]) != app.global().processId()) + std::cout << "[" << app.global().processId() << "]ResBack proc is " << resBack[part][0] << " should be " << app.global().processId() << std::endl; + if(int(resBack[part][1]) != part) + std::cout << "[" << app.global().processId() << "]ResBack localidx is " << resBack[part][1] << " should be " << part << std::endl; + } + } + + return 0; +} +