diff --git a/Doc/Src_tex/ParallelDetails.pdf b/Doc/Src_tex/ParallelDetails.pdf
index 6ad670aee7216bbe3450fcd1b4d235cbc2e443a2..9c46298ee0600395a6bb9b6433c352b844ca0b16 100755
Binary files a/Doc/Src_tex/ParallelDetails.pdf and b/Doc/Src_tex/ParallelDetails.pdf differ
diff --git a/Doc/Src_tex/ParallelDetails.tex b/Doc/Src_tex/ParallelDetails.tex
index 1a81839a75c84eb89f83fbe16903f225635c5edc..070d9a4c8d3a84547709d325af7c4e7ab59e60aa 100755
--- a/Doc/Src_tex/ParallelDetails.tex
+++ b/Doc/Src_tex/ParallelDetails.tex
@@ -3,11 +3,12 @@
 \usepackage{listings}
 \usepackage{geometry}
 \usepackage{graphicx}
+\usepackage{tikz}
 \usepackage[hypertexnames=false, pdftex]{hyperref}
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 % use:$ pdflatex ParallelDetails.tex
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
-\author{Berenger Bramas, Olivier Coulaud, Cyrille Piacibelo}
+\author{Berenger Bramas, Olivier Coulaud, Cyrille Piacibello}
 \title{ScalFmm - Parallel Algorithms (Draft)}
 \date{\today}
@@ -152,26 +153,64 @@ That is the reason why we need to balance the data among nodes.
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{Balancing the leaves}
-After sorting, each process has potentially several leaves.
-If we have two processes $P_{i}$ and $P_{j}$ with $i < j$ the sort guarantees that all leaves from node i are inferior than the leaves on the node j in a Morton indexing way.
-But the leaves are randomly distributed among the nodes and we need to balance them.
-It is a simple reordoring of the data, but the data has to stayed sorted.
+After sorting, each process has potentially several leaves. If we
+have two processes $P_{i}$ and $P_{j}$ with $i < j$, the sort
+guarantees that all leaves on node $i$ have lower Morton indices than
+the leaves on node $j$. But the leaves are randomly distributed among
+the nodes and we need to balance them. This is a simple reordering of
+the data, but the data has to stay sorted.
 \begin{enumerate}
 \item Each process informs other to tell how many leaves it holds.
-\item Each process compute how many leaves it has to send or to receive from left or right.
+\item Each process computes how many leaves it has to send to, or to
+  receive from, its left and right neighbours.
 \label{balRef}
 \end{enumerate}
-At the end of the algorithm our system is completely balanced with the same number of leaves on each process.
+
+At the end of the algorithm the system is completely balanced, with
+the same number of leaves on each process. If another kind of
+balancing is needed, one only has to change the BalanceAlgorithm
+class that is passed as a parameter to the ArrayToTree static method
+in step \ref{balRef}.
+
+\subsection{Balancing algorithms supported}
+Any balancing algorithm can be used, but it has to provide at least
+the two methods declared in the class FAbstractBalancingAlgorithm.
+Those methods are :
+\begin{enumerate}
+\item GetLeft : returns the number of leaves that will belong to the
+  processes on the left of the given process.
+\item GetRight : returns the number of leaves that will belong to the
+  processes on the right of the given process.
+\end{enumerate}
+
+Both methods receive as parameters the total number of leaves, the
+total number of particles, the number of processes, and the index of
+the process to be treated.
 \begin{figure}[h!]
 \begin{center}
   \includegraphics[width=15cm, height=15cm, keepaspectratio=true]{Images/Balance.png}
-  \caption{Balancing Example}
+  \caption{Balancing Example : a process has to send data to the
+    left if its current left limit is greater than its target limit.
+    The same holds on the other side, and the calculation can be
+    reversed to know whether a process has to receive data.}
 \end{center}
 \end{figure}
-A process has to send data to the left if its current left limit is upper than its objective limit.
-Same in the other side, and we can reverse the calculs to know if a process has to received data.
+\clearpage
+\subsection{MPI calls}
+Once every process knows the bounds GetRight() and GetLeft(), for
+itself and for every other process, there is only one MPI
+communication, an AllToAll.
+
+To prepare the buffers to be sent and received, each process counts
+the number of leaves (and their size) it holds, and divides them into
+up to three parts :
+\begin{enumerate}
+\item The data to send to the processes on the left (can be empty).
+\item The data to keep (can be empty).
+\item The data to send to the processes on the right (can be empty).
+\end{enumerate}
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -237,10 +276,10 @@ shared cell at a level.
 There are to cases :
 
 \begin{itemize}
-  \item My first cell is shared means that I need to send the children I have of
-  this cell to the processus on my left.
-  \item My last cell is shared means that I need to receive some
-  children from the processus on my right.
+\item If my first cell is shared, I need to send the children I have of
+  this cell to the process on my left.
+\item If my last cell is shared, I need to receive some
+  children from the process on my right.
 \end{itemize}
 
@@ -313,6 +352,46 @@ Example :
 \hline
 \end{tabular}
 
+\subsection{Modified M2M}
+The algorithm may not be efficient in some particular cases. Since the
+communications do not progress (even asynchronous ones) while the M2M
+is being computed, the algorithm has been modified so that one of the
+OpenMP threads is dedicated to the communications.
+
+\begin{algorithm}[H]
+  \RestyleAlgo{boxed}
+  \LinesNumbered
+  \SetAlgoLined
+  \KwData{none}
+  \KwResult{none}
+  \BlankLine
+  \For{idxLevel $\leftarrow$ $Height - 2$ \KwTo 1}{
+    \tcp{pragma omp single}
+    \Begin(To be done by one thread only){
+      \uIf{$cells[0]$ not in my working interval}{
+        isend($cells[0].child$)\;
+        hasSend $\leftarrow$ true\;
+      }
+      \uIf{$cells[end]$ in another working interval}{
+        irecv(recvBuffer)\;
+        hasRecv $\leftarrow$ true\;
+      }
+      \emph{Wait send and recv if needed}\;
+      \uIf{hasRecv is true}{
+        M2M($cells[end]$, recvBuffer)\;
+      }
+    }
+    \tcp{pragma omp for}
+    \Begin(To be done by all the other threads){
+      \ForAll{Cell c at level idxLevel in working interval}{
+        M2M(c, c.child)\;
+      }
+    }
+  }
+  \BlankLine
+  \caption{Distributed M2M}
+\end{algorithm}
+
 \clearpage
 
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
@@ -413,13 +492,25 @@ Finally they send and receive data in an asynchronous way and cover it by the P2
   \BlankLine
   \caption{Distributed P2P}
 \end{algorithm}
+
+\subsection{Shared Memory Version}
+The P2P operator is computed only once for each pair of leaves
+belonging to the same process. This means that when a process computes
+the force on the particles of leaf $1$ due to the particles of leaf
+$2$, both leaves $1$ and $2$ are updated.
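+
+The sketch below illustrates the idea on a minimal, hypothetical data
+layout; the types and the force expression are placeholders and do not
+correspond to the actual ScalFMM kernel interface :
+
+\begin{lstlisting}[language=C++]
+#include <cmath>
+#include <vector>
+
+// Hypothetical particle and leaf types, for illustration only.
+struct Particle { double x, y, z, fx, fy, fz; };
+typedef std::vector<Particle> Leaf;
+
+// Each pair of local leaves is treated once: the interaction is
+// evaluated a single time and accumulated on both sides.
+void P2PMutual(Leaf& leafA, Leaf& leafB){
+    for(Particle& t : leafA){
+        for(Particle& s : leafB){
+            const double dx = s.x - t.x, dy = s.y - t.y, dz = s.z - t.z;
+            const double r2 = dx*dx + dy*dy + dz*dz;
+            const double c  = 1.0 / (r2 * std::sqrt(r2));
+            t.fx += c*dx; t.fy += c*dy; t.fz += c*dz; // force on the target leaf
+            s.fx -= c*dx; s.fy -= c*dy; s.fz -= c*dz; // opposite force on the source leaf
+        }
+    }
+}
+\end{lstlisting}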
+
+This way of computing the interactions is faster, but it leads to
+concurrency problems.
+
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \section{M2L}
 The M2L operator is relatively similar to the P2P.
-Hence P2P is done at the leaves level, M2L is done on several levels from Height - 2 to 2.
+While P2P is done at the leaves level, M2L is done on several levels, from $Height - 2$ down to 2.
 At each level, a node needs to have access to all the distant neighbors of the cells it is the proprietary and those ones can be hosted by any other node.
 Anyway, each node can compute a part of the M2L with the data it has.
+\subsection{Original Algorithm}
 The algorithm can be viewed as several tasks:
 \begin{enumerate}
 \item Compute to know what data has to be sent
@@ -436,7 +527,7 @@ The algorithm can be viewed as several tasks:
   \KwData{none}
   \KwResult{none}
   \BlankLine
-  \ForAll{Level idxLeve from 2 to Height - 2}{
+  \ForAll{Level idxLevel from 2 to Height - 2}{
     \ForAll{Cell c at level idxLevel}{
       neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
       \ForAll{index in neighborsIndexes}{
@@ -462,6 +553,103 @@ The algorithm can be viewed as several tasks:
   \BlankLine
   \caption{Distributed M2L}
 \end{algorithm}
+
+\subsection{Modified Algorithm}
+The idea in the following version is to overlap the communications
+between processes with the work (M2L Self) that can be done without
+any data from outside.
+
+\begin{algorithm}[H]
+  \RestyleAlgo{boxed}
+  \LinesNumbered
+  \SetAlgoLined
+  \KwData{none}
+  \KwResult{none}
+  \BlankLine
+  \Begin(To be done by one thread only){ \label{single}
+    \ForAll{Level idxLevel from 2 to Height - 2}{
+      \ForAll{Cell c at level idxLevel}{
+        neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
+        \ForAll{index in neighborsIndexes}{
+          \uIf{index belongs to another proc}{
+            isend(c)\;
+            \emph{Mark c as a cell that is linked to another proc}\;
+          }
+        }
+      }
+    }
+    \emph{Wait send and recv if needed}\;
+  }
+  \Begin(To be done by everybody else){\label{multiple}
+    \emph{Normal M2L}\;
+  }
+  \ForAll{Cell c received}{
+    $lightOctree.insert( c )$\;
+  }
+  \ForAll{Level idxLevel from 2 to Height - 1}{
+    \ForAll{Cell c at level idxLevel that are marked}{
+      neighborsIndexes $\leftarrow$ $c.potentialDistantNeighbors()$\;
+      neighbors $\leftarrow$ lightOctree.get(neighborsIndexes)\;
+      M2L( c, neighbors)\;
+    }
+  }
+  \BlankLine
+  \caption{Distributed M2L 2}
+
+\end{algorithm}
+
+
+\appendix
+
+\clearpage
+\chapter{Cheat sheet about using EZtrace with ViTE on ScalFMM}
+
+In this appendix, one can find useful information about using EZtrace
+on ScalFMM, and about visualising the traces with ViTE.
+
+\section{EZtrace}
+EZTrace is a tool that aims at automatically generating execution
+traces from HPC (High Performance Computing) programs.
+
+It does not need any source instrumentation.
+Useful variables :
+
+\begin{itemize}
+\item EZTRACE\_FLUSH : set the value to $1$ in order to flush the
+  event buffer to the disk in case of huge amounts of data.
+\item EZTRACE\_TRACE : choice of the types of events one wants to
+  trace. Example : EZTRACE\_TRACE="mpi stdio omp memory". Remark : MPI makes a
+  lot of calls to pthread, so I suggest not tracing pthread events if
+  you want to be able to visualize the results.
+\item EZTRACE\_TRACE\_DIR : path to a directory in which EZTrace will
+  store the trace of each MPI process.
+  (Set it to /lustre/username/smt to avoid overhead.)
+\end{itemize}
+
+Once the traces are generated, one needs to convert them in order to
+visualize them.
+
+\section{ViTE}
+ViTE consumes a lot of memory, so in order to use it, avoid tracing
+pthread events for example.
+
+One can zoom in and out in the Gantt chart.
+
+Plugin : one can get, for each process, a histogram displaying the
+percentage of time spent in the different sections.
+\begin{itemize}
+  \item Go to Preferences $\rightarrow$ Plugin.
+\end{itemize}
+
+Sorting the Gantt charts : sometimes the processes are badly sorted
+(like 0,1,10,11,2,3,4,5,6,7,8,9). It is possible to sort them with the
+mouse, or by editing an XML file :
+\begin{itemize}
+  \item Go to Preferences $\rightarrow$ Node Selection and then Sort,
+    or export/load an XML file.
+\end{itemize}
+
+
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 %%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
 \begin{thebibliography}{9}
diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp
old mode 100755
new mode 100644
index d5c13e16787e488447cbe784e5604932553c54a9..c9da61c8828b85954038c99a030df2d398b20782
--- a/Src/Core/FFmmAlgorithmThreadProc.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProc.hpp
@@ -23,6 +23,7 @@
 #include "../Utils/FLog.hpp"
 #include "../Utils/FTrace.hpp"
 #include "../Utils/FTic.hpp"
+#include "../Utils/FMPILog.hpp"
 #include "../Utils/FGlobal.hpp"
 
 #include "../Containers/FBoolArray.hpp"
@@ -61,8 +62,8 @@
  */
 template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
 class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
-
-    // const static int MaxSizePerCell = CellClass::GetSize();
+    // Can be deleted
+    // const static int MaxSizePerCell = CellClass::GetSize();
 
     OctreeClass* const tree;                 //< The octree to work on
     KernelClass** kernels;                   //< The kernels
@@ -78,6 +79,8 @@ class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
     const int idProcess;                //< Id of current process
     const int OctreeHeight;            //<Height of the tree
+
+    FMPILog* sharedLogger;             //< Shared Logger to store all the information needed
 
     /** An interval is the morton index interval
      * that a proc use (it holds data in this interval)
@@ -96,7 +99,6 @@ class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
         return workingIntervalsPerLevel[OctreeHeight * proc + level];
     }
 
-
 public:
     /** Get current proc interval at level */
     Interval& getWorkingInterval( int level){
@@ -117,8 +119,9 @@ public:
         : tree(inTree) , kernels(0), comm(inComm), iterArray(nullptr),numberOfLeafs(0),
           MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
           OctreeHeight(tree->getHeight()),intervals(new Interval[inComm.processCount()]),
-          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]){
-
+          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()])
+    {
+        FLOG(sharedLogger = new FMPILog(comm));
         FAssertLF(tree, "tree cannot be null");
 
         this->kernels = new KernelClass*[MaxThreads];
@@ -146,7 +149,7 @@ public:
      */
     void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
         FTRACE( FTrace::FFunction functionTrace( __FUNCTION__, "Fmm" , __FILE__ , __LINE__ ) );
-
+        FLOG(sharedLogger->newCounter("Prolog"));
         // Count leaf
         this->numberOfLeafs = 0;
         {
@@ -179,7 +182,6 @@ public:
             octreeIterator.gotoBottomLeft();
             octreeIterator.moveUp();
 
-            //Da fck is dat ?!
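+            // currentLimit below is the Morton index, one level up (>> 3), of the
+            // previous process's upper bound; it is used while building the
+            // per-level working intervals.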
MortonIndex currentLimit = intervals[idProcess-1].max >> 3; for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){ @@ -198,6 +200,8 @@ public: delete[] myIntervals; } + FLOG(sharedLogger->tac("Prolog")); + // run; if(operationsToProceed & FFmmP2M) bottomPass(); @@ -209,6 +213,7 @@ public: if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass(); + // delete array delete [] iterArray; iterArray = 0; @@ -224,8 +229,8 @@ private: void bottomPass(){ FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) ); FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) ); - FLOG(FTic counterTime); - + //FLOG(sharedLogger->newCounter(std::string("counterTimeP2M"))); + typename OctreeClass::Iterator octreeIterator(tree); // Iterate on leafs @@ -235,7 +240,8 @@ private: iterArray[leafs++] = octreeIterator; } while(octreeIterator.moveRight()); - FLOG(FTic computationCounter); + //FLOG(sharedLogger->newCounter("ParallelSectionP2M")); + //FLOG(sharedLogger->newCounter("ParallelSection")); #pragma omp parallel { KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; @@ -244,11 +250,11 @@ private: myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc()); } } - FLOG(computationCounter.tac()); - - FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" ); - FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); - + //FLOG(sharedLogger->tac("ParallelSection")); + //FLOG(sharedLogger->tac("ParallelSectionP2M")); + //FLOG(sharedLogger->tac("counterTimeP2M")); + // FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" ); + // FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); } ///////////////////////////////////////////////////////////////////////////// @@ -262,9 +268,9 @@ private: FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); ); FLOG(FTic counterTime); FLOG(FTic computationCounter); - FLOG(FTic prepareCounter); - FLOG(FTic waitCounter); - + FLOG(FTic singleCounter); + FLOG(FTic parallelCounter); + // Start from leal level - 1 typename OctreeClass::Iterator octreeIterator(tree); octreeIterator.gotoBottomLeft(); @@ -286,10 +292,9 @@ private: CellClass recvBufferCells[8]; int firstProcThatSend = idProcess + 1; - - // for each levels + FLOG(computationCounter.tic()); + //Loop for work for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){ - // No more work for me if(idProcess != 0 && getWorkingInterval((idxLevel+1), idProcess).max <= getWorkingInterval((idxLevel+1), idProcess - 1).max){ break; @@ -303,141 +308,154 @@ private: } while(octreeIterator.moveRight()); avoidGotoLeftIterator.moveUp(); octreeIterator = avoidGotoLeftIterator; - - // We may need to send something - int iterRequests = 0; + int cellsToSend = -1; + int iterRequests = 0; + + int endIndex = numberOfCells; + //Test if i'm not the last, and I need st to compute my last M2M + if((idProcess != nbProcess-1) && (getWorkingInterval(idxLevel+1,idProcess+1)).min <= getWorkingInterval(idxLevel+1,idProcess).max){ + endIndex--; + } while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).min){ ++cellsToSend; } - - FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) ); - - FLOG(prepareCounter.tic()); - if(idProcess != 0 - && 
(getWorkingInterval((idxLevel+1), idProcess).min >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).max >>3)){ - - char state = 0; - sendBuffer.write(state); - - const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild(); - for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ - if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).min <= child[idxChild]->getMortonIndex() ){ - child[idxChild]->serializeUp(sendBuffer); - state = char(state | (0x1 << idxChild)); + FLOG(parallelCounter.tic()); +#pragma omp parallel + { + //This single section is supposed post and receive the comms, and then do the M2M associated with its. +#pragma omp single + { + FLOG(singleCounter.tic()); + //Datas needed in several parts of the section + bool hasToReceive; + int endProcThatSend; + + //Post Send + if(idProcess != 0 + && (getWorkingInterval((idxLevel+1), idProcess).min >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).max >>3)){ - } - } - sendBuffer.writeAt(0,state); - - while( sendToProc && iterArray[cellsToSend].getCurrentGlobalIndex() <= getWorkingInterval(idxLevel , sendToProc - 1).max){ - --sendToProc; - } - - MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, sendToProc, - FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]); - - } - // We may need to receive something - bool hasToReceive = false; - int endProcThatSend = firstProcThatSend; - - if(idProcess != nbProcess - 1){ // if I'm the last one (idProcess == nbProcess-1), I shall not receive anything in a M2M - while(firstProcThatSend < nbProcess - && (getWorkingInterval((idxLevel+1), firstProcThatSend).max) <= (getWorkingInterval((idxLevel+1), idProcess).max)){ - // Second condition :: while firstProcThatSend max morton index is < to myself max interval - ++firstProcThatSend; - } - - if(firstProcThatSend < nbProcess && - (getWorkingInterval((idxLevel+1), firstProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3) ){ - - endProcThatSend = firstProcThatSend; - - while( endProcThatSend < nbProcess && - (getWorkingInterval((idxLevel+1) ,endProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3)){ - ++endProcThatSend; - } - - - if(firstProcThatSend != endProcThatSend){ - hasToReceive = true; + char state = 0; + sendBuffer.write(state); + + const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild(); + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).min <= child[idxChild]->getMortonIndex() ){ + child[idxChild]->serializeUp(sendBuffer); + state = char(state | (0x1 << idxChild)); + + } + } + sendBuffer.writeAt(0,state); - for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){ - MPI_Irecv(&recvBuffer.data()[idxProc * recvBufferOffset], recvBufferOffset, MPI_PACKED, - idxProc, FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]); + while( sendToProc && iterArray[cellsToSend].getCurrentGlobalIndex() <= getWorkingInterval(idxLevel , sendToProc - 1).max){ + --sendToProc; } + + MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, sendToProc, + FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]); + } - } - } - - FLOG(prepareCounter.tac()); - FTRACE( regionTrace.end() ); - - // Compute - const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells); - FLOG(computationCounter.tic()); -#pragma omp parallel - { - KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]); -#pragma omp for nowait - for( int 
idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){ - myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); - } - } - FLOG(computationCounter.tac()); - - // Are we sending or waiting anything? - if(iterRequests){ - FLOG(waitCounter.tic()); - MPI_Waitall( iterRequests, requests, status); - FLOG(waitCounter.tac()); - - // we were receiving data - if( hasToReceive ){ - CellClass* currentChild[8]; - memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*)); - - // retreive data and merge my child and the child from others - for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){ - recvBuffer.seek(idxProc * recvBufferOffset); - int state = int(recvBuffer.getValue<char>()); + //Post receive + { + // We may need to receive something + hasToReceive = false; + endProcThatSend = firstProcThatSend; - int position = 0; - while( state && position < 8){ - while(!(state & 0x1)){ - state >>= 1; - ++position; + if(idProcess != nbProcess - 1){ // if I'm the last one (idProcess == nbProcess-1), I shall not receive anything in a M2M + while(firstProcThatSend < nbProcess + && (getWorkingInterval((idxLevel+1), firstProcThatSend).max) <= (getWorkingInterval((idxLevel+1), idProcess).max)){ + // Second condition :: while firstProcThatSend max morton index is < to myself max interval + ++firstProcThatSend; } - FAssertLF(!currentChild[position], "Already has a cell here"); - - recvBufferCells[position].deserializeUp(recvBuffer); - currentChild[position] = (CellClass*) &recvBufferCells[position]; - - state >>= 1; - ++position; + if(firstProcThatSend < nbProcess && + (getWorkingInterval((idxLevel+1), firstProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3) ){ + + endProcThatSend = firstProcThatSend; + + while( endProcThatSend < nbProcess && + (getWorkingInterval((idxLevel+1) ,endProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3)){ + ++endProcThatSend; + } + + + if(firstProcThatSend != endProcThatSend){ + hasToReceive = true; + + for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){ + MPI_Irecv(&recvBuffer.data()[idxProc * recvBufferOffset], recvBufferOffset, MPI_PACKED, + idxProc, FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]); + } + } + } } } + //Wait For the comms, and do the work + { + // Are we sending or waiting anything? 
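+                        // The thread executing this single section blocks here on the M2M
+                        // communications it posted above; once they complete, the children
+                        // received from the processes on the right are merged with the local
+                        // ones and the M2M of the shared last cell is done by this thread.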
+ if(iterRequests){ + //FLOG(sharedLogger->tic("waitCounterM2M")); + MPI_Waitall( iterRequests, requests, status); + //FLOG(sharedLogger->tac("waitCounterM2M")); + // we were receiving data + if( hasToReceive ){ + CellClass* currentChild[8]; + memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*)); + + // retreive data and merge my child and the child from others + for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){ + recvBuffer.seek(idxProc * recvBufferOffset); + int state = int(recvBuffer.getValue<char>()); + + int position = 0; + while( state && position < 8){ + while(!(state & 0x1)){ + state >>= 1; + ++position; + } + + FAssertLF(!currentChild[position], "Already has a cell here"); + + recvBufferCells[position].deserializeUp(recvBuffer); + currentChild[position] = (CellClass*) &recvBufferCells[position]; + + state >>= 1; + ++position; + } + } - // Finally compute - FLOG(computationCounter.tic()); - (*kernels[0]).M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel); - FLOG(computationCounter.tac()); - - firstProcThatSend = endProcThatSend - 1; + // Finally compute + //FLOG(sharedLogger->tic("ParallelSectionM2M")); + (*kernels[0]).M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel); + //FLOG(sharedLogger->tac("ParallelSectionM2M")); + + firstProcThatSend = endProcThatSend - 1; + } + } + } + sendBuffer.reset(); + recvBuffer.seek(0); + FLOG(singleCounter.tac()); + }//End Of Single section + + KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]); +#pragma omp for nowait + for( int idxCell = cellsToSend+1 ; idxCell < endIndex ; ++idxCell){ + myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); } - } - sendBuffer.reset(); - recvBuffer.seek(0); + }//End of parallel section + FLOG(parallelCounter.tac()); } + - - FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = " << counterTime.tacAndElapsed() << " s)\n" ); - FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" ); + FLOG(counterTime.tac()); + FLOG(computationCounter.tac()); + FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = " << counterTime.elapsed() << " s)\n" ); + FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller << "\t\t Wait : " << singleCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Wait : " << parallelCounter.cumulated() << " s\n" ); } ///////////////////////////////////////////////////////////////////////////// @@ -452,10 +470,11 @@ private: FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); ); FLOG(FTic counterTime); FLOG(FTic computationCounter); - FLOG(FTic sendCounter); - FLOG(FTic receiveCounter); - FLOG(FTic prepareCounter); + FLOG(FTic singleCounter); FLOG(FTic gatherCounter); + FLOG(FTic m2lSelf); + FLOG(FTic m2lFar); + FLOG(FTic sendCounter); ////////////////////////////////////////////////////////////////// // First know what to send to who @@ -470,307 +489,326 @@ private: FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight]; memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight); - { - FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) ); - 
FLOG(prepareCounter.tic()); - - // To know if a leaf has been already sent to a proc - bool*const alreadySent = new bool[nbProcess]; - memset(alreadySent, 0, sizeof(bool) * nbProcess); + //Variables needed in multiple omp sections + int iterRequest; + MPI_Request * requests; + MPI_Status * status; + int* globalReceiveMap; + FMpiBufferWriter** sendBuffer; + FMpiBufferReader** recvBuffer; - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.moveDown(); - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - // for each levels - for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ - if(idProcess != 0 - && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ +#pragma omp parallel + { +#pragma omp single nowait + { + FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) ); + FLOG(singleCounter.tic()); + + // To know if a leaf has been already sent to a proc + bool*const alreadySent = new bool[nbProcess]; + memset(alreadySent, 0, sizeof(bool) * nbProcess); + + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.moveDown(); + typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); + // for each levels + for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ + if(idProcess != 0 + && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ + avoidGotoLeftIterator.moveDown(); + octreeIterator = avoidGotoLeftIterator; + + continue; + } + + int numberOfCells = 0; + + while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){ + octreeIterator.moveRight(); + } + + // for each cells + do{ + iterArray[numberOfCells] = octreeIterator; + ++numberOfCells; + } while(octreeIterator.moveRight()); avoidGotoLeftIterator.moveDown(); octreeIterator = avoidGotoLeftIterator; - - continue; - } - - int numberOfCells = 0; - - while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){ - octreeIterator.moveRight(); - } - - // for each cells - do{ - iterArray[numberOfCells] = octreeIterator; - ++numberOfCells; - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - - leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells); - - - // Which cell potentialy needs other data and in the same time - // are potentialy needed by other - MortonIndex neighborsIndexes[189]; - for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ - // Find the M2L neigbors of a cell - const int counter = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes); - - memset(alreadySent, false, sizeof(bool) * nbProcess); - bool needOther = false; - // Test each negibors to know which one do not belong to us - for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ - if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).min - || (getWorkingInterval(idxLevel , idProcess).max) < neighborsIndexes[idxNeigh]){ - int procToReceive = idProcess; - while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).min ){ - --procToReceive; - } - while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).max) < neighborsIndexes[idxNeigh]){ - ++procToReceive; - } - // Maybe already sent to that proc? 
- if( !alreadySent[procToReceive] - && getWorkingInterval(idxLevel , procToReceive).min <= neighborsIndexes[idxNeigh] - && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).max){ - - alreadySent[procToReceive] = true; - - needOther = true; - - toSend[idxLevel * nbProcess + procToReceive].push(iterArray[idxCell]); - ++indexToSend[idxLevel * nbProcess + procToReceive]; + + leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells); + + + // Which cell potentialy needs other data and in the same time + // are potentialy needed by other + MortonIndex neighborsIndexes[189]; + for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ + // Find the M2L neigbors of a cell + const int counter = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes); + + memset(alreadySent, false, sizeof(bool) * nbProcess); + bool needOther = false; + // Test each negibors to know which one do not belong to us + for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ + if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).min + || (getWorkingInterval(idxLevel , idProcess).max) < neighborsIndexes[idxNeigh]){ + int procToReceive = idProcess; + while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).min ){ + --procToReceive; + } + while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).max) < neighborsIndexes[idxNeigh]){ + ++procToReceive; + } + // Maybe already sent to that proc? + if( !alreadySent[procToReceive] + && getWorkingInterval(idxLevel , procToReceive).min <= neighborsIndexes[idxNeigh] + && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).max){ + + alreadySent[procToReceive] = true; + + needOther = true; + + toSend[idxLevel * nbProcess + procToReceive].push(iterArray[idxCell]); + ++indexToSend[idxLevel * nbProcess + procToReceive]; + } } } + if(needOther){ + leafsNeedOther[idxLevel]->set(idxCell,true); + } + } - if(needOther){ - leafsNeedOther[idxLevel]->set(idxCell,true); - } - + } - } - FLOG(prepareCounter.tac()); - - delete[] alreadySent; - } - - ////////////////////////////////////////////////////////////////// - // Gather this information - ////////////////////////////////////////////////////////////////// - - FLOG(gatherCounter.tic()); - // All process say to each others - // what the will send to who - int*const globalReceiveMap = new int[nbProcess * nbProcess * OctreeHeight]; - memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess * OctreeHeight); - FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, comm.getComm()), __LINE__ ); - FLOG(gatherCounter.tac()); - - - ////////////////////////////////////////////////////////////////// - // Send and receive for real - ////////////////////////////////////////////////////////////////// - - FLOG(sendCounter.tic()); - // Then they can send and receive (because they know what they will receive) - // To send in asynchrone way - MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight]; - MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight]; - int iterRequest = 0; - - const int SizeOfCellToSend = sizeof(MortonIndex) + sizeof(int) + MaxSizePerCell; - - FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight]; - memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight); + delete[] alreadySent; + - 
FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight]; - memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight); + ////////////////////////////////////////////////////////////////// + // Gather this information + ////////////////////////////////////////////////////////////////// + + FLOG(gatherCounter.tic()); + // All process say to each others + // what the will send to who + globalReceiveMap = new int[nbProcess * nbProcess * OctreeHeight]; + memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess * OctreeHeight); + FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, comm.getComm()), __LINE__ ); + FLOG(gatherCounter.tac()); + + ////////////////////////////////////////////////////////////////// + // Send and receive for real + ////////////////////////////////////////////////////////////////// + + FLOG(sendCounter.tic()); + // Then they can send and receive (because they know what they will receive) + // To send in asynchrone way + requests = new MPI_Request[2 * nbProcess * OctreeHeight]; + status = new MPI_Status[2 * nbProcess * OctreeHeight]; + iterRequest = 0; + + const int SizeOfCellToSend = int(sizeof(MortonIndex) + sizeof(int) + MaxSizePerCell); + + sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight]; + memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight); + + recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight]; + memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight); - for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - const int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc]; - if(toSendAtProcAtLevel != 0){ - sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),toSendAtProcAtLevel * SizeOfCellToSend); - - for(int idxLeaf = 0 ; idxLeaf < toSendAtProcAtLevel; ++idxLeaf){ - const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex(); - sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex); - toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]); + + for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + const int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc]; + if(toSendAtProcAtLevel != 0){ + sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),toSendAtProcAtLevel * SizeOfCellToSend); + + for(int idxLeaf = 0 ; idxLeaf < toSendAtProcAtLevel; ++idxLeaf){ + const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex(); + sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex); + toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]); + } + + FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(), + sendBuffer[idxLevel * nbProcess + idxProc]->getSize(),MPI_PACKED, idxProc, + FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + } + + const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; + if(toReceiveFromProcAtLevel){ + recvBuffer[idxLevel * nbProcess + idxProc] = new 
FMpiBufferReader(comm.getComm(),toReceiveFromProcAtLevel * SizeOfCellToSend); + + FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(), + recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), MPI_PACKED,idxProc, + FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + } } - - FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(), - sendBuffer[idxLevel * nbProcess + idxProc]->getSize(),MPI_PACKED, idxProc, - FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); } - - const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; - if(toReceiveFromProcAtLevel){ - recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),toReceiveFromProcAtLevel * SizeOfCellToSend); - - FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(), - recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), MPI_PACKED,idxProc, - FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + FLOG(sendCounter.tac()); + { + ////////////////////////////////////////////////////////////////// + // Wait received data and compute + ////////////////////////////////////////////////////////////////// + + // Wait to receive every things (and send every things) + MPI_Waitall(iterRequest, requests, status); } - } - } - FLOG(sendCounter.tac()); - - ////////////////////////////////////////////////////////////////// - // Do M2L - ////////////////////////////////////////////////////////////////// - - { - FTRACE( FTrace::FRegion regionTrace("Compute", __FUNCTION__ , __FILE__ , __LINE__) ); - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.moveDown(); - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - // Now we can compute all the data - // for each levels - for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ - if(idProcess != 0 - && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ - + FLOG(singleCounter.tac()); + }//End of Single section + + ////////////////////////////////////////////////////////////////// + // Do M2L SELF + ////////////////////////////////////////////////////////////////// + FLOG(m2lSelf.tic()); + { + FTRACE( FTrace::FRegion regionTrace("Compute", __FUNCTION__ , __FILE__ , __LINE__) ); + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.moveDown(); + typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); + // Now we can compute all the data + // for each levels + for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ + if(idProcess != 0 + && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ + + avoidGotoLeftIterator.moveDown(); + octreeIterator = avoidGotoLeftIterator; + + continue; + } + + int numberOfCells = 0; + while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){ + octreeIterator.moveRight(); + } + // for each cells + do{ + iterArray[numberOfCells] = octreeIterator; + ++numberOfCells; + } while(octreeIterator.moveRight()); avoidGotoLeftIterator.moveDown(); octreeIterator = avoidGotoLeftIterator; + + FLOG(computationCounter.tic()); + //FLOG(sharedLogger->tic("ParallelSection")); + //FLOG(sharedLogger->newCounter("TestParallelM2L")); - continue; - } - - int numberOfCells = 0; - while(octreeIterator.getCurrentGlobalIndex() < 
getWorkingInterval(idxLevel , idProcess).min){ - octreeIterator.moveRight(); - } - // for each cells - do{ - iterArray[numberOfCells] = octreeIterator; - ++numberOfCells; - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - - FLOG(computationCounter.tic()); -#pragma omp parallel - { - KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; - const CellClass* neighbors[343]; - -#pragma omp for schedule(dynamic) nowait - for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ - const int counter = tree->getInteractionNeighbors(neighbors, iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel); - if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel); + { + KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; + const CellClass* neighbors[343]; + +#pragma omp for schedule(dynamic) //nowait + for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ + const int counter = tree->getInteractionNeighbors(neighbors, iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel); + if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel); + } + myThreadkernels->finishedLevelM2L(idxLevel); } - - myThreadkernels->finishedLevelM2L(idxLevel); + //FLOG(sharedLogger->tac("TestParallelM2L")); + FLOG(computationCounter.tac()); + //FLOG(sharedLogger->tac("ParallelSection")); } - FLOG(computationCounter.tac()); } + FLOG(m2lSelf.tac()); } + + + FTRACE( FTrace::FRegion regionTrace("Compute Received data", __FUNCTION__ , __FILE__ , __LINE__) ); + + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.moveDown(); + typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); + // compute the second time + // for each levels + for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ + if(idProcess != 0 + && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ + + avoidGotoLeftIterator.moveDown(); + octreeIterator = avoidGotoLeftIterator; + + continue; + } - ////////////////////////////////////////////////////////////////// - // Wait received data and compute - ////////////////////////////////////////////////////////////////// - - // Wait to receive every things (and send every things) - MPI_Waitall(iterRequest, requests, status); - - { - FTRACE( FTrace::FRegion regionTrace("Compute Received data", __FUNCTION__ , __FILE__ , __LINE__) ); - FLOG(receiveCounter.tic()); - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.moveDown(); - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - // compute the second time - // for each levels - for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ - if(idProcess != 0 - && getWorkingInterval(idxLevel, idProcess).max <= getWorkingInterval(idxLevel, idProcess - 1).max){ - - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - - continue; - } - - // put the received data into a temporary tree - FLightOctree<CellClass> tempTree; - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; + // put the received data into a temporary tree + FLightOctree<CellClass> tempTree; + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + 
idxLevel * nbProcess + idProcess]; - for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){ - const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->FMpiBufferReader::getValue<MortonIndex>(); + for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){ + const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->FMpiBufferReader::getValue<MortonIndex>(); - CellClass* const newCell = new CellClass; - newCell->setMortonIndex(cellIndex); - newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]); + CellClass* const newCell = new CellClass; + newCell->setMortonIndex(cellIndex); + newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]); - tempTree.insertCell(cellIndex, idxLevel, newCell); - } + tempTree.insertCell(cellIndex, idxLevel, newCell); } + } + // take cells from our octree only if they are + // linked to received data + int numberOfCells = 0; + int realCellId = 0; - // take cells from our octree only if they are - // linked to received data - int numberOfCells = 0; - int realCellId = 0; - - while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){ - octreeIterator.moveRight(); + while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).min){ + octreeIterator.moveRight(); + } + // for each cells + do{ + // copy cells that need data from others + if(leafsNeedOther[idxLevel]->get(realCellId++)){ + iterArray[numberOfCells++] = octreeIterator; } - // for each cells - do{ - // copy cells that need data from others - if(leafsNeedOther[idxLevel]->get(realCellId++)){ - iterArray[numberOfCells++] = octreeIterator; - } - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; + } while(octreeIterator.moveRight()); + avoidGotoLeftIterator.moveDown(); + octreeIterator = avoidGotoLeftIterator; - delete leafsNeedOther[idxLevel]; - leafsNeedOther[idxLevel] = 0; + delete leafsNeedOther[idxLevel]; + leafsNeedOther[idxLevel] = 0; - // Compute this cells - FLOG(computationCounter.tic()); + // Compute this cells + FLOG(computationCounter.tic()); + //FLOG(sharedLogger->tic("ParallelSection")); + FLOG(m2lFar.tic()); #pragma omp parallel - { - KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; - MortonIndex neighborsIndex[189]; - int neighborsPosition[189]; - const CellClass* neighbors[343]; + { + KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; + MortonIndex neighborsIndex[189]; + int neighborsPosition[189]; + const CellClass* neighbors[343]; #pragma omp for schedule(dynamic) nowait - for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ - // compute indexes - memset(neighbors, 0, 343 * sizeof(CellClass*)); - const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition); - - int counter = 0; - // does we receive this index from someone? 
- for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){ - if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).min) - || (getWorkingInterval(idxLevel , idProcess).max) < neighborsIndex[idxNeig]){ - - CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel); - - if(otherCell){ - //otherCell->setMortonIndex(neighborsIndex[idxNeig]); - neighbors[ neighborsPosition[idxNeig] ] = otherCell; - ++counter; - } + for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ + // compute indexes + memset(neighbors, 0, 343 * sizeof(CellClass*)); + const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition); + + int counter = 0; + // does we receive this index from someone? + for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){ + if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).min) + || (getWorkingInterval(idxLevel , idProcess).max) < neighborsIndex[idxNeig]){ + + CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel); + + if(otherCell){ + //otherCell->setMortonIndex(neighborsIndex[idxNeig]); + neighbors[ neighborsPosition[idxNeig] ] = otherCell; + ++counter; } } - // need to compute - if(counter){ - myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel); - } } - - myThreadkernels->finishedLevelM2L(idxLevel); + // need to compute + if(counter){ + myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel); + } } - FLOG(computationCounter.tac()); + + myThreadkernels->finishedLevelM2L(idxLevel); } - FLOG(receiveCounter.tac()); + //FLOG(sharedLogger->tac("ParallelSection")); + FLOG(computationCounter.tac()); + FLOG(m2lFar.tac()); } for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){ @@ -790,16 +828,17 @@ private: FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = " << counterTime.tacAndElapsed() << " s)\n" ); FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t M2L Self : " << m2lSelf.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t M2L Far : " << m2lFar.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t M2L Gather : " << gatherCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller << "\t\t M2L Send : " << sendCounter.elapsed() << " s\n" ); } ////////////////////////////////////////////////////////////////// // ---------------- L2L --------------- ////////////////////////////////////////////////////////////////// - + void downardPass(){ // second L2L const int MaxSizePerCell = CellClass::GetSize(); FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) ); @@ -813,10 +852,10 @@ private: typename OctreeClass::Iterator octreeIterator(tree); octreeIterator.moveDown(); typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - + MPI_Request*const requests = new MPI_Request[nbProcess]; MPI_Status*const status = new MPI_Status[nbProcess]; - + const int heightMinusOne = 
OctreeHeight - 1; FMpiBufferWriter sendBuffer(comm.getComm(),MaxSizePerCell); @@ -847,51 +886,75 @@ private: ++firstCellWork; } - bool needToRecv = false; - int iterRequests = 0; - - FLOG(prepareCounter.tic()); - - // do we need to receive one or zeros cell - if(idProcess != 0 - && (getWorkingInterval((idxLevel + 1) , idProcess).min >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).max >> 3 ) ){ - needToRecv = true; - +#pragma omp parallel + { +#pragma omp single + { + bool needToRecv = false; + int iterRequests = 0; - MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, MPI_ANY_SOURCE, - FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]); - } + FLOG(prepareCounter.tic()); + + // do we need to receive one or zeros cell + if(idProcess != 0 + && (getWorkingInterval((idxLevel + 1) , idProcess).min >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).max >> 3 ) ){ + needToRecv = true; - if(idProcess != nbProcess - 1){ - int firstProcThatRecv = idProcess + 1; - while( firstProcThatRecv < nbProcess && - getWorkingInterval((idxLevel + 1) , firstProcThatRecv).max <= getWorkingInterval((idxLevel+1) , idProcess).max){ - ++firstProcThatRecv; - } + MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, MPI_ANY_SOURCE, + FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]); + } - int endProcThatRecv = firstProcThatRecv; - while( endProcThatRecv < nbProcess && - (getWorkingInterval((idxLevel + 1) , endProcThatRecv).min >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).max >> 3) ){ - ++endProcThatRecv; - } + + if(idProcess != nbProcess - 1){ + int firstProcThatRecv = idProcess + 1; + while( firstProcThatRecv < nbProcess && + getWorkingInterval((idxLevel + 1) , firstProcThatRecv).max <= getWorkingInterval((idxLevel+1) , idProcess).max){ + ++firstProcThatRecv; + } - if(firstProcThatRecv != endProcThatRecv){ - iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer); + int endProcThatRecv = firstProcThatRecv; + while( endProcThatRecv < nbProcess && + (getWorkingInterval((idxLevel + 1) , endProcThatRecv).min >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).max >> 3) ){ + ++endProcThatRecv; + } + + if(firstProcThatRecv != endProcThatRecv){ + iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer); - for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){ + for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){ - MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProc, - FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]); + MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProc, + FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]); + } + + } } + + // are we sending or receiving? 
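+                    // The single thread waits for the L2L communications it posted; if the
+                    // shared first cell was received from a process on the left, its data is
+                    // deserialized and the L2L is applied to its children right away.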
+ if(iterRequests){ + + // process + FLOG(waitCounter.tic()); + MPI_Waitall( iterRequests, requests, status); + FLOG(waitCounter.tac()); + + if(needToRecv){ + // Need to compute + FLOG(computationCounter.tic()); + iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer); + + kernels[0]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel); + FLOG(computationCounter.tac()); + } + } + + }//End Of Single Section + FLOG(prepareCounter.tac()); - } - } - FLOG(prepareCounter.tac()); - - FLOG(computationCounter.tic()); -#pragma omp parallel - { + FLOG(computationCounter.tic()); + //FLOG(sharedLogger->tic("ParallelSection")); + //#pragma omp parallel KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]); #pragma omp for nowait for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){ @@ -899,24 +962,7 @@ private: } } FLOG(computationCounter.tac()); - - // are we sending or receiving? - if(iterRequests){ - - // process - FLOG(waitCounter.tic()); - MPI_Waitall( iterRequests, requests, status); - FLOG(waitCounter.tac()); - - if(needToRecv){ - // Need to compute - FLOG(computationCounter.tic()); - iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer); - - kernels[0]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel); - FLOG(computationCounter.tac()); - } - } + //FLOG(sharedLogger->tac("ParallelSection")); sendBuffer.reset(); recvBuffer.seek(0); @@ -941,6 +987,7 @@ private: ContainerClass* targets; ContainerClass* sources; }; + /** P2P */ void directPass(){ FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) ); @@ -949,9 +996,11 @@ private: FLOG( FTic prepareCounter); FLOG( FTic gatherCounter); FLOG( FTic waitCounter); + FLOG(FTic computation2Counter); + FLOG(FTic computationCounter); /////////////////////////////////////////////////// - // Prepare data to send receive + // Prepar data to send receive /////////////////////////////////////////////////// FLOG(prepareCounter.tic()); @@ -973,158 +1022,40 @@ private: */ int*const globalReceiveMap = new int[nbProcess * nbProcess]; memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess); + + FVector<LeafData> * leafsNeedOtherData; + LeafData* leafsDataArray; + OctreeClass* otherP2Ptree; FBoolArray leafsNeedOther(this->numberOfLeafs); int countNeedOther = 0; - - { - FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) ); - // Copy leafs - { - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - int idxLeaf = 0; - do{ - this->iterArray[idxLeaf++] = octreeIterator; - } while(octreeIterator.moveRight()); - } - - // Number of cells max - //const int limite = 1 << (this->OctreeHeight - 1); - // pointer to send - FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess]; - - // array that will be send to other processus for them to build the globalReceiveMap - int partsToSend[nbProcess]; - memset(partsToSend, 0, sizeof(int) * nbProcess); - - // To know if a leaf has been already sent to a proc - int alreadySent[nbProcess]; - - //Will store the indexes of the neighbors of current cell - MortonIndex indexesNeighbors[26]; - //Obviously unused - //int uselessIndexArray[26]; - - for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){ - memset(alreadySent, 0, sizeof(int) * nbProcess); - bool needOther = false; - //Get the neighbors of current 
cell in indexesNeighbors, and their number in neighCount - const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors); - //Loop over the neighbor leafs - for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){ - //Test if leaf belongs to someone else (false if it's mine) - if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){ - needOther = true; - - // find the proc that will need current leaf - int procToReceive = idProcess; - while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].min){ - --procToReceive; //scroll process "before" current process - } - - while( procToReceive != nbProcess - 1 && (intervals[procToReceive].max) < indexesNeighbors[idxNeigh]){ - ++procToReceive;//scroll process "after" current process - } - // Test : Not Already Send && USELESS TEST ? - if( !alreadySent[procToReceive] && intervals[procToReceive].min <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].max){ - - alreadySent[procToReceive] = 1; - toSend[procToReceive].push( iterArray[idxLeaf] ); - partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize(); - partsToSend[procToReceive] += int(sizeof(MortonIndex)); - } - } - } - - if(needOther){ //means that something need to be sent (or received) - leafsNeedOther.set(idxLeaf,true); - ++countNeedOther; - } - } - - // No idea why it is mandatory there, could it be a few line before, - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(partsToSend[idxProc]){ - partsToSend[idxProc] += int(sizeof(int)); - } - } - - //Share to all processus globalReceiveMap - FLOG(gatherCounter.tic()); - FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ ); - FLOG(gatherCounter.tac()); - - {//TODO : remove - //Print the globalReceiveMap for Process 0 - // if(idProcess == 0) - // { - // printf("\n Proc 0 :: \n"); - // for(int u = 0 ; u < nbProcess ; ++u){ - // for(int v = 0 ; v < nbProcess ; ++v){ - // printf("\t %d",globalReceiveMap[u*nbProcess+v]); - // } - // printf("\n"); - // } - // } - } - - - //Prepare receive - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. - //allocate buffer of right size - recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]); - FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), MPI_PACKED, - idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); - } - } - - nbMessagesToRecv = iterRequest; - // Prepare send - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(toSend[idxProc].getSize() != 0){ - sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]); - // << is equivalent to write(). 
- (*sendBuffer[idxProc]) << toSend[idxProc].getSize(); - for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){ - (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex(); - toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]); - } - //TEST BERENGER - //if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){ - FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize() , MPI_PACKED , - idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); - - } - } - - delete[] toSend; - } - FLOG(prepareCounter.tac()); - + /////////////////////////////////////////////////// // Prepare data for thread P2P /////////////////////////////////////////////////// - + // init const int LeafIndex = OctreeHeight - 1; const int SizeShape = 3*3*3; - + int shapeLeaf[SizeShape]; memset(shapeLeaf,0,SizeShape*sizeof(int)); + + leafsDataArray = new LeafData[this->numberOfLeafs]; - LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs]; + leafsNeedOtherData = new FVector<LeafData>(countNeedOther); - FVector<LeafData> leafsNeedOtherData(countNeedOther); + // This first part is sequential, we split the datas between + // colors to avoid writing concurrency later with omp threads + // split data { FTRACE( FTrace::FRegion regionTrace( "Split" , __FUNCTION__ , __FILE__ , __LINE__) ); typename OctreeClass::Iterator octreeIterator(tree); octreeIterator.gotoBottomLeft(); - + // to store which shape for each leaf typename OctreeClass::Iterator* const myLeafs = new typename OctreeClass::Iterator[this->numberOfLeafs]; int*const shapeType = new int[this->numberOfLeafs]; @@ -1155,7 +1086,7 @@ private: leafsDataArray[startPosAtShape[shapePosition]].cell = myLeafs[idxInArray].getCurrentCell(); leafsDataArray[startPosAtShape[shapePosition]].targets = myLeafs[idxInArray].getCurrentListTargets(); leafsDataArray[startPosAtShape[shapePosition]].sources = myLeafs[idxInArray].getCurrentListSrc(); - if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData.push(leafsDataArray[startPosAtShape[shapePosition]]); + if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData->push(leafsDataArray[startPosAtShape[shapePosition]]); ++startPosAtShape[shapePosition]; } @@ -1163,143 +1094,288 @@ private: delete[] shapeType; delete[] myLeafs; } - - - ////////////////////////////////////////////////////////// - // Computation P2P that DO NOT need others data - ////////////////////////////////////////////////////////// - FTRACE( FTrace::FRegion regionP2PTrace("Compute P2P", __FUNCTION__ , __FILE__ , __LINE__) ); - - FLOG(FTic computationCounter); - + + //At this point, we start with the parallel section + //One thread will be in charge of communication + //Two comm : AllGather then iSend and IRecv #pragma omp parallel { +#pragma omp single nowait + { + FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) ); + // Copy leafs + { + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); + int idxLeaf = 0; + do{ + this->iterArray[idxLeaf++] = octreeIterator; + } while(octreeIterator.moveRight()); + } + + // Number of cells max + //const int limite = 1 << (this->OctreeHeight - 1); + // pointer to send + FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess]; + + // array that will be send to other processus for them to build the globalReceiveMap + int partsToSend[nbProcess]; + memset(partsToSend, 0, sizeof(int) * nbProcess); + + // 
To know if a leaf has been already sent to a proc + int alreadySent[nbProcess]; + + //Will store the indexes of the neighbors of current cell + MortonIndex indexesNeighbors[26]; + //Obviously unused + //int uselessIndexArray[26]; + + for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){ + memset(alreadySent, 0, sizeof(int) * nbProcess); + bool needOther = false; + //Get the neighbors of current cell in indexesNeighbors, and their number in neighCount + const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors); + //Loop over the neighbor leafs + for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){ + //Test if leaf belongs to someone else (false if it's mine) + if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){ + needOther = true; + + // find the proc that will need current leaf + int procToReceive = idProcess; + while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].min){ + --procToReceive; //scroll process "before" current process + } + + while( procToReceive != nbProcess - 1 && (intervals[procToReceive].max) < indexesNeighbors[idxNeigh]){ + ++procToReceive;//scroll process "after" current process + } + // Test : Not Already Send && USELESS TEST ? + if( !alreadySent[procToReceive] && intervals[procToReceive].min <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].max){ + + alreadySent[procToReceive] = 1; + toSend[procToReceive].push( iterArray[idxLeaf] ); + partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize(); + partsToSend[procToReceive] += int(sizeof(MortonIndex)); + } + } + } + + if(needOther){ //means that something need to be sent (or received) + leafsNeedOther.set(idxLeaf,true); + ++countNeedOther; + } + } + + // No idea why it is mandatory there, could it be a few line before, + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + if(partsToSend[idxProc]){ + partsToSend[idxProc] += int(sizeof(int)); + } + } + + //Share to all processus globalReceiveMap + FLOG(gatherCounter.tic()); + FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ ); + FLOG(gatherCounter.tac()); + + {//TODO : remove + //Print the globalReceiveMap for Process 0 + // if(idProcess == 0) +// { +// printf("\n Proc 0 :: \n"); +// for(int u = 0 ; u < nbProcess ; ++u){ +// for(int v = 0 ; v < nbProcess ; ++v){ +// printf("\t %d",globalReceiveMap[u*nbProcess+v]); +// } +// printf("\n"); +// } +// } + + } + + + //Prepare receive + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. + //allocate buffer of right size + recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]); + FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), MPI_PACKED, + idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + } + } + + nbMessagesToRecv = iterRequest; + // Prepare send + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + if(toSend[idxProc].getSize() != 0){ + sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]); + // << is equivalent to write(). 
+ (*sendBuffer[idxProc]) << toSend[idxProc].getSize(); + for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){ + (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex(); + toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]); + } + //TEST BERENGER + //if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){ + FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize() , MPI_PACKED , + idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ ); + + } + } + + delete[] toSend; + ////////////////////////////////////////////////////////// + // Waitsend receive + ////////////////////////////////////////////////////////// + + FLOG(computation2Counter.tic()); + + // Create an octree with leaves from others + otherP2Ptree = new OctreeClass( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() ); + int complete = 0; + int*const indexMessage = new int[nbProcess * 2]; + while( complete != iterRequest){ + memset(indexMessage, 0, sizeof(int) * nbProcess * 2); + int countMessages = 0; + // Wait data + FLOG(waitCounter.tic()); + MPI_Waitsome(iterRequest, requests, &countMessages, indexMessage, status); + + FLOG(waitCounter.tac()); + complete += countMessages; + + + for(int idxRcv = 0 ; idxRcv < countMessages ; ++idxRcv){ + if( indexMessage[idxRcv] < nbMessagesToRecv ){ + const int idxProc = status[idxRcv].MPI_SOURCE; + int nbLeaves; + (*recvBuffer[idxProc]) >> nbLeaves; + for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + MortonIndex leafIndex; + (*recvBuffer[idxProc]) >> leafIndex; + otherP2Ptree->createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc])); + } + delete recvBuffer[idxProc]; + recvBuffer[idxProc] = 0; + } + } + } + delete[] indexMessage; + + }//End single section + + FLOG(prepareCounter.tac()); + + ////////////////////////////////////////////////////////// + // Computation P2P that DO NOT need others data + ////////////////////////////////////////////////////////// + FTRACE( FTrace::FRegion regionP2PTrace("Compute P2P", __FUNCTION__ , __FILE__ , __LINE__) ); + + FLOG(computationCounter.tic()); + //FLOG(sharedLogger->tic("ParallelSection")); + //#pragma omp parallel + //{ KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]); // There is a maximum of 26 neighbors ContainerClass* neighbors[27]; int previous = 0; - + for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){ const int endAtThisShape = shapeLeaf[idxShape] + previous; - -#pragma omp for + // #pragma omp single + // { + // char file[5]; + // sprintf(file,"re%d",idProcess); + // FILE * fd = fopen(file,"a+"); + // fprintf(fd,"Proc %d \t, Color : %d, from %d to %d, total %d \n",idProcess,idxShape,previous,endAtThisShape,endAtThisShape-previous); + // fclose(fd); + // } +#pragma omp for schedule(auto) for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; ++idxLeafs){ LeafData& currentIter = leafsDataArray[idxLeafs]; myThreadkernels.L2P(currentIter.cell, currentIter.targets); - + // need the current particles and neighbors particles const int counter = tree->getLeafsNeighbors(neighbors, currentIter.coord, LeafIndex); myThreadkernels.P2P( currentIter.coord,currentIter.targets, currentIter.sources, neighbors, counter); } - + previous = endAtThisShape; } - } - FLOG(computationCounter.tac()); - FTRACE( regionP2PTrace.end() ); - - ////////////////////////////////////////////////////////// - // Waitsend receive - ////////////////////////////////////////////////////////// - - 
FLOG(FTic computation2Counter); - - // Create an octree with leaves from others - OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() ); - int complete = 0; - int*const indexMessage = new int[nbProcess * 2]; - while( complete != iterRequest){ - memset(indexMessage, 0, sizeof(int) * nbProcess * 2); - int countMessages = 0; - // Wait data - FLOG(waitCounter.tic()); - MPI_Waitsome(iterRequest, requests, &countMessages, indexMessage, status); + //} + + //FLOG(sharedLogger->tac("ParallelSection")); + FLOG(computationCounter.tac()); + FTRACE( regionP2PTrace.end() ); - FLOG(waitCounter.tac()); - complete += countMessages; - - - for(int idxRcv = 0 ; idxRcv < countMessages ; ++idxRcv){ - if( indexMessage[idxRcv] < nbMessagesToRecv ){ - const int idxProc = status[idxRcv].MPI_SOURCE; - int nbLeaves; - (*recvBuffer[idxProc]) >> nbLeaves; - for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ - MortonIndex leafIndex; - (*recvBuffer[idxProc]) >> leafIndex; - otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc])); - } - delete recvBuffer[idxProc]; - recvBuffer[idxProc] = 0; - } - } - } - delete[] indexMessage; - - ////////////////////////////////////////////////////////// - // Computation P2P that need others data - ////////////////////////////////////////////////////////// - - FTRACE( FTrace::FRegion regionOtherTrace("Compute P2P Other", __FUNCTION__ , __FILE__ , __LINE__) ); - FLOG( computation2Counter.tic() ); - -#pragma omp parallel - { - KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]); - // There is a maximum of 26 neighbors - ContainerClass* neighbors[27]; - MortonIndex indexesNeighbors[27]; - int indexArray[26]; - // Box limite - const int nbLeafToProceed = leafsNeedOtherData.getSize(); - -#pragma omp for - for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){ - LeafData currentIter = leafsNeedOtherData[idxLeafs]; - - // need the current particles and neighbors particles - int counter = 0; - memset( neighbors, 0, sizeof(ContainerClass*) * 27); - - // Take possible data - const int nbNeigh = currentIter.coord.getNeighborsIndexes(OctreeHeight, indexesNeighbors, indexArray); - - for(int idxNeigh = 0 ; idxNeigh < nbNeigh ; ++idxNeigh){ - if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){ - ContainerClass*const hypotheticNeighbor = otherP2Ptree.getLeafSrc(indexesNeighbors[idxNeigh]); - if(hypotheticNeighbor){ - neighbors[ indexArray[idxNeigh] ] = hypotheticNeighbor; - ++counter; + + ////////////////////////////////////////////////////////// + // Computation P2P that need others data + ////////////////////////////////////////////////////////// + + FTRACE( FTrace::FRegion regionOtherTrace("Compute P2P Other", __FUNCTION__ , __FILE__ , __LINE__) ); + //FLOG( computation2Counter.tic() ); + //FLOG(sharedLogger->tic("ParallelSection")); + //#pragma omp parallel + { + /*KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);*/ + // There is a maximum of 26 neighbors + /*ContainerClass* neighbors[27];*/ + MortonIndex indexesNeighbors[27]; + int indexArray[26]; + // Box limite + const int nbLeafToProceed = leafsNeedOtherData->getSize(); + +#pragma omp for schedule(auto) nowait + for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){ + LeafData currentIter = (*leafsNeedOtherData)[idxLeafs]; + + // need the current particles and neighbors particles + int counter = 0; + memset( neighbors, 0, sizeof(ContainerClass*) * 
27); + + // Take possible data + const int nbNeigh = currentIter.coord.getNeighborsIndexes(OctreeHeight, indexesNeighbors, indexArray); + + for(int idxNeigh = 0 ; idxNeigh < nbNeigh ; ++idxNeigh){ + if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){ + ContainerClass*const hypotheticNeighbor = otherP2Ptree->getLeafSrc(indexesNeighbors[idxNeigh]); + if(hypotheticNeighbor){ + neighbors[ indexArray[idxNeigh] ] = hypotheticNeighbor; + ++counter; + } } } - } - - myThreadkernels.P2PRemote( currentIter.cell->getCoordinate(), currentIter.targets, - currentIter.sources, neighbors, counter); + + myThreadkernels.P2PRemote( currentIter.cell->getCoordinate(), currentIter.targets, + currentIter.sources, neighbors, counter); + }//End For + } - - } - + }//End parallel section + //FLOG(sharedLogger->tac("ParallelSection")); + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ delete sendBuffer[idxProc]; delete recvBuffer[idxProc]; } delete[] globalReceiveMap; delete[] leafsDataArray; - + FLOG(computation2Counter.tac()); - - + + FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = " << counterTime.tacAndElapsed() << " s)\n" ); FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Computation P2P 2 : " << computation2Counter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" ); FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" ); - + } - + }; @@ -1308,5 +1384,3 @@ private: #endif //FFMMALGORITHMTHREAD_HPP - -
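
The reorganised directPass() above runs the whole pass inside one omp parallel region: a single nowait block lets one thread drive the MPI exchange while the remaining threads sweep leafsDataArray colour by colour (3x3x3 = 27 shapes). The standalone sketch below shows only that OpenMP skeleton; it is a toy under stated assumptions, with no MPI and no ScalFmm types, and Leaf, fakeCommunication and computeP2P are invented placeholders, not the real kernels.

// Minimal OpenMP-only sketch (not ScalFmm code) of the directPass() structure:
// one thread takes the "single nowait" region to handle communication-like
// work while the other threads process the leaves colour by colour.
// Compile with e.g.: g++ -std=c++11 -fopenmp colored_sweep.cpp
#include <cstdio>
#include <vector>
#include <chrono>
#include <thread>
#include <omp.h>

struct Leaf { int color; double value; };

static void fakeCommunication() {
    // Stands in for the Allgather/Isend/Irecv/Waitsome sequence done by the
    // single thread in the real pass.
    std::this_thread::sleep_for(std::chrono::milliseconds(50));
}

static void computeP2P(Leaf& leaf) {
    // Stands in for L2P + P2P on one leaf.
    leaf.value += 1.0;
}

int main() {
    const int nbColors = 27;        // 3*3*3 shapes, as in the shape colouring
    const int leavesPerColor = 40;  // arbitrary toy size

    // Leaves already grouped colour by colour, like leafsDataArray after the
    // sequential "split data" step.
    std::vector<Leaf> leafs;
    for (int color = 0; color < nbColors; ++color)
        for (int i = 0; i < leavesPerColor; ++i)
            leafs.push_back(Leaf{color, 0.0});

    #pragma omp parallel
    {
        // One thread only handles the communications; nowait lets the other
        // threads start computing immediately.
        #pragma omp single nowait
        {
            fakeCommunication();
        }

        // Every thread sweeps the colours in the same order; in the real pass
        // two leaves of the same colour are never neighbours, so the mutual
        // P2P updates cannot conflict.
        int previous = 0;
        for (int color = 0; color < nbColors; ++color) {
            const int endAtThisColor = previous + leavesPerColor;
            #pragma omp for schedule(auto)
            for (int idx = previous; idx < endAtThisColor; ++idx) {
                computeP2P(leafs[idx]);
            }
            previous = endAtThisColor;
        }
    }

    std::printf("processed %zu leaves with up to %d threads\n",
                leafs.size(), omp_get_max_threads());
    return 0;
}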
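
The communication handled by that single thread follows a fixed MPI pattern: an Allgather of the per-destination byte counts builds globalReceiveMap, matching Irecv/Isend pairs are posted, and completions are drained with MPI_Waitsome, using the position of the finished request (compared against nbMessagesToRecv) to tell receives from sends. The sketch below reproduces this exchange with plain MPI on toy buffers; it is an illustration only, not ScalFmm code, and every name in it is made up for the example.

// Standalone sketch (not ScalFmm code) of the P2P exchange pattern.
// Compile/run with e.g.: mpicxx -std=c++11 exchange.cpp && mpirun -np 4 ./a.out
#include <mpi.h>
#include <cstdio>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);
    int rank = 0, nbProcess = 1;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &nbProcess);

    // Bytes this rank will send to each other rank (toy sizes here).
    std::vector<int> partsToSend(nbProcess, 0);
    for (int p = 0; p < nbProcess; ++p)
        if (p != rank) partsToSend[p] = 8 * (rank + 1);

    // globalReceiveMap[src * nbProcess + dst] = bytes that src sends to dst.
    std::vector<int> globalReceiveMap(nbProcess * nbProcess, 0);
    MPI_Allgather(partsToSend.data(), nbProcess, MPI_INT,
                  globalReceiveMap.data(), nbProcess, MPI_INT, MPI_COMM_WORLD);

    std::vector<MPI_Request> requests;
    std::vector<std::vector<char>> recvBuffer(nbProcess), sendBuffer(nbProcess);

    // Post the receives first, then the sends, one pair per partner.
    for (int p = 0; p < nbProcess; ++p) {
        const int incoming = globalReceiveMap[p * nbProcess + rank];
        if (incoming > 0) {
            recvBuffer[p].resize(incoming);
            requests.push_back(MPI_Request());
            MPI_Irecv(recvBuffer[p].data(), incoming, MPI_BYTE, p, 99,
                      MPI_COMM_WORLD, &requests.back());
        }
    }
    const int nbMessagesToRecv = int(requests.size());
    for (int p = 0; p < nbProcess; ++p) {
        if (partsToSend[p] > 0) {
            sendBuffer[p].assign(partsToSend[p], char('a' + rank % 26));
            requests.push_back(MPI_Request());
            MPI_Isend(sendBuffer[p].data(), partsToSend[p], MPI_BYTE, p, 99,
                      MPI_COMM_WORLD, &requests.back());
        }
    }

    // Drain completions as they arrive; only the first nbMessagesToRecv
    // requests are receives, the remaining ones are our own sends.
    int complete = 0;
    std::vector<int> indexes(requests.size());
    std::vector<MPI_Status> status(requests.size());
    while (complete != int(requests.size())) {
        int countMessages = 0;
        MPI_Waitsome(int(requests.size()), requests.data(), &countMessages,
                     indexes.data(), status.data());
        complete += countMessages;
        for (int i = 0; i < countMessages; ++i) {
            if (indexes[i] < nbMessagesToRecv) {
                const int src = status[i].MPI_SOURCE;
                std::printf("rank %d received %zu bytes from %d\n",
                            rank, recvBuffer[src].size(), src);
            }
        }
    }

    MPI_Finalize();
    return 0;
}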