// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROCPPERIODIC_HPP
#define FFMMALGORITHMTHREADPROCPPERIODIC_HPP


#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"

#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"
#include "../Utils/FMemUtils.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"

#include "../Containers/FBufferWriter.hpp"
#include "../Containers/FBufferReader.hpp"
#include "../Containers/FMpiBufferWriter.hpp"
#include "../Containers/FMpiBufferReader.hpp"

#include "../Utils/FMpi.hpp"

#include <omp.h>

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"

#include <memory>

/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 * @class FFmmAlgorithmThreadProcPeriodic
 * @brief Threaded FMM algorithm, distributed with MPI, with periodic boundary conditions.
 * Please read the license.
 *
 * This class is a threaded FMM algorithm distributed with MPI.
 * It iterates on the tree and calls the kernels with the proper arguments.
 * It uses the inspector-executor model:
 * it traverses the tree and builds an array, then works in parallel on this array.
 *
 * This class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model. Example run:
 * export OMP_NUM_THREADS=2
 * export OMPI_CXX=`which g++-4.4`
 * mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 * --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
 * ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
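//
// A minimal usage sketch. Hypothetical: the concrete octree/cell/kernel types, the
// FMpi/app.global() setup, and the execute() entry point inherited from FAbstractAlgorithm
// are assumptions to adapt to your own code.
//
//     FMpi app(argc, argv);
//     OctreeClass tree(treeHeight, subHeight, boxWidth, boxCenter);
//     // ... insert the particles owned by this process ...
//     FFmmAlgorithmThreadProcPeriodic<FReal, OctreeClass, CellClass, ContainerClass,
//                                     KernelClass, LeafClass> algo(app.global(), &tree,
//                                                                  /*inUpperLevel*/ 3);
//     // The kernel must be built with the extended (periodic) geometry, not the real one
//     KernelClass kernel(algo.extendedTreeHeight(), algo.extendedBoxWidth(), algo.extendedBoxCenter());
//     algo.setKernel(&kernel);
//     algo.execute();
//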
template<class FReal, class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProcPeriodic : public FAbstractAlgorithm {
    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels

    const FMpi::FComm& comm;                 //< MPI comm

    CellClass rootCellFromProc;     //< Root of the tree, needed by the periodicity
    const int nbLevelsAboveRoot;    //< The number of levels the user asks to go above the tree (>= -1)
    const int offsetRealTree;       //< Level offset added to the real tree (nbLevelsAboveRoot + 2)

    typename OctreeClass::Iterator* iterArray;
    typename OctreeClass::Iterator* iterArrayComm;  //< Stores pointers to the cells/leaves to send/receive
    int numberOfLeafs;                          //< The number of leaves owned by this process

    const int MaxThreads;               //< The maximum number of threads allowed by OpenMP

    const int nbProcess;                //< Number of processes
    const int idProcess;                //< Id of the current process

    const int OctreeHeight;

    const int leafLevelSeparationCriteria;

public:
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };

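    /** Access the working interval of a given process at a given level.
     * workingIntervalsPerLevel stores OctreeHeight intervals per process, ordered
     * process by process, hence the (OctreeHeight * proc + level) indexing. */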
    Interval& getWorkingInterval(const int level, const int proc)const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }
private:
    Interval*const intervals;
    Interval*const workingIntervalsPerLevel;

    /** To know if a proc has work at a given level (if it holds cells and is responsible for them) */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** Return true if idxProc's left cell at idxLevel+1 has the same parent as our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** Return true if idxProc's right cell at idxLevel+1 has the same parent as our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }

public:

    void setKernel(KernelClass*const inKernels){
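        // One kernel clone per thread; the clones are created one at a time inside a
        // critical section because the kernel copy constructor may not be thread-safe.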
        this->kernels = new KernelClass*[MaxThreads];
        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp critical (InitFFmmAlgorithmThreadProcPeriodic)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }
    }


    Interval& getWorkingInterval(const int level){
        return getWorkingInterval(level, idProcess);
    }

    bool hasWorkAtLevel(const int level){
        return idProcess == 0 || getWorkingInterval(level, idProcess - 1).rightIndex < getWorkingInterval(level, idProcess).rightIndex;
    }

    /** The constructor needs the octree to work on; the kernels are set later through setKernel().
     * @param inComm the MPI communicator
     * @param inTree the octree to work on
     * @param inUpperLevel the number of levels to add above the root (>= -1)
     * An assertion is raised if the tree is null.
     */
    FFmmAlgorithmThreadProcPeriodic(const FMpi::FComm& inComm, OctreeClass* const inTree,
                                    const int inUpperLevel = 2, const int inLeafLevelSeperationCriteria = 1)
        : tree(inTree) , kernels(nullptr), comm(inComm), nbLevelsAboveRoot(inUpperLevel), offsetRealTree(inUpperLevel + 2),
          numberOfLeafs(0),
          MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
          OctreeHeight(tree->getHeight()),
          leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
          intervals(new Interval[inComm.processCount()]),
          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {

        FAssertLF(tree, "tree cannot be null");
        FAssertLF(-1 <= inUpperLevel, "inUpperLevel cannot be < -1");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");

        FAbstractAlgorithm::setNbLevelsInTree(extendedTreeHeight());

        FLOG(FLog::Controller << "FFmmAlgorithmThreadProcPeriodic\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }

    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProcPeriodic(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }




    long long int theoricalRepetition() const {
        if( nbLevelsAboveRoot == -1 ){
            // we know it is 3 (-1;+1)
            return 3;
        }
        // Else we find the repetition in one dir and double it
        return 6 * (1<<(nbLevelsAboveRoot));
    }
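    // For example, with nbLevelsAboveRoot == 2 the box is repeated
    // theoricalRepetition() == 6 * 2^2 == 24 times per direction, i.e. over [-12 ; 11].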


    void repetitionsIntervals(FTreeCoordinate*const min, FTreeCoordinate*const max) const {
        if( nbLevelsAboveRoot == -1 ){
            // We know it is (-1;1)
            min->setPosition(-1,-1,-1);
            max->setPosition(1,1,1);
        }
        else{
            const int halfRepeated = int(theoricalRepetition()/2);
            min->setPosition(-halfRepeated,-halfRepeated,-halfRepeated);
            // if we repeat the box 8 times, we go from [-4 to 3]
            max->setPosition(halfRepeated-1,halfRepeated-1,halfRepeated-1);
        }
    }


    FReal extendedBoxWidth() const {
        // The simulation box is repeated 4 times in each direction if nbLevelsAboveRoot==-1,
        // and the factor doubles for each additional level above the root
        return tree->getBoxWidth() * FReal(1<<(nbLevelsAboveRoot+3));
    }

    /** This function has to be used to init the kernel with the correct arguments.
     * It returns the box center seen from the kernel's point of view, according to the
     * periodicity the user asked for. It is computed from the tree's original box width
     * and box center.
     * @return the center the kernel should use
     */
    FPoint<FReal> extendedBoxCenter() const {
        const FReal originalBoxWidth     = tree->getBoxWidth();
        const FReal originalBoxWidthDiv2 = originalBoxWidth/2.0;
        const FPoint<FReal> originalBoxCenter   = tree->getBoxCenter();

        const FReal offset = extendedBoxWidth()/FReal(2.0);
        return FPoint<FReal>( originalBoxCenter.getX() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getY() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getZ() - originalBoxWidthDiv2 + offset);
    }
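    // Worked example: for an original box centered at (0,0,0) with width 1 and nbLevelsAboveRoot == -1,
    // extendedBoxWidth() == 4 and the extended center is (-0.5 + 2, ...) = (1.5, 1.5, 1.5),
    // so the original box sits in the lower corner of the extended box.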

    /** This function has to be used to init the kernel with the correct arguments.
     * It returns the tree height seen from the kernel's point of view, according to the
     * periodicity the user asked for. It is computed from the real tree height.
     * @return the height the kernel should use
     */
    int extendedTreeHeight() const {
        // The real height plus the offset induced by the periodicity levels
        return OctreeHeight + offsetRealTree;
    }


protected:
    /**
     * To execute the FMM algorithm.
     * Call this function to run the complete algorithm.
     */
    void executeCore(const unsigned operationsToProceed) override {
        // Count leaf
        this->numberOfLeafs = 0;
        {
            Interval myFullInterval;
            {//Building the interval with the first and last leaves (and count the number of leaves)
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
            }
            // Allocate arrays to store pointers to the cells at a given level
            iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
            iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
            FAssertLF(iterArray,     "iterArray     bad alloc");
            FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

            // We get the leftIndex/rightIndex indexes from each process
            FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()),  __LINE__ );

            // Build my intervals for all levels
            std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
            // At leaf level we know it is the full interval
            myIntervals[OctreeHeight - 1] = myFullInterval;

            // We can estimate the interval for each level by using the parent/child relation
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
            }

            // Process 0 uses the estimates as real intervals, but other processes
            // should remove cells that belong to others
            if(idProcess != 0){
                // For each level, test whether the process on the left (idProcess-1) owns cells I thought I owned
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                // At h-1 the working limit is the parent of the right cell of the proc on the left
                MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                // We check if we have no more work to do
                int nullIntervalFromLevel = 0;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                        if( !octreeIterator.moveRight() ){
                            // We cannot move right we are not owner of any more cell
                            nullIntervalFromLevel = idxLevel;
                            break;
                        }
                    }
                    // If we are responsible for some cells at this level keep the first index
                    if(nullIntervalFromLevel == 0){
                        myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                        octreeIterator.moveUp();
                        workingLimitAtLevel >>= 3;
                    }
                }
                // In case we are not responsible for any cells we put the leftIndex = rightIndex+1
                for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                }
            }

            // We get the leftIndex/rightIndex indexes from each process
            FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()),  __LINE__ );
        }

        // run;
        if(operationsToProceed & FFmmP2M) bottomPass();

        if(operationsToProceed & FFmmM2M) upwardPass();

        if(operationsToProceed & FFmmM2L) transferPass();

        if(operationsToProceed & FFmmL2L) downardPass();

        if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));


        // delete array
        delete []     iterArray;
        delete [] iterArrayComm;
        iterArray     = nullptr;
        iterArrayComm = nullptr;
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M.
     * It is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the pointers to the leaves into the array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FLOG(computationCounter.tic());
#pragma omp parallel num_threads(MaxThreads)
        {
            // Each thread gets its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from the leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to
        // Starting from the proc on the left, this variable will go down to 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 8-1 receptions
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        FSize bufferSize;
        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
        FSize recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that sends a cell to me
        // This variable will go up to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // We work from height-1 to 1
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 0 ; --idxLevel ){
            const int fackLevel = idxLevel + offsetRealTree;
            // Are my cells covered by my neighbors' working interval, so that I have no more work?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for send/recv requests

            int nbCellsToSkip     = 0; // The number of cells to skip (they are outside my working interval)
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }

            FLOG(parallelCounter.tic());
            #pragma omp parallel num_threads(MaxThreads)
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                // This single section posts and receives the comms, and then does the M2M associated with them.
                #pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // The master proc never sends
                    if(idProcess != 0){
                        // Skip processes that have no work at that level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Does the next proc that has work share the parent of my left cell?
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);

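                            // The first byte of the message is a bit mask telling which of the
                            // 8 children are serialized in the buffer (bit i set => child i present)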
                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the right-most cell outside my working interval should be taken into
                            // consideration (at pos nbCellsToSkip-1); the others (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if the child exists and was part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    // Post the receives; the data is needed in several parts of the section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Start from the first proc that may send to me
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]),
                                        idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting for anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Start from the first proc that may send to me
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED,
                                        idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting for anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We have received something, so we need to process the last M2M
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve the data and merge my children with the children received from others
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());

                            int position = 0;
                            int positionToInsert = 0;
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(positionToInsert < 7);
                                FAssertLF(position < 8);
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];

                                packageFlags >>= 1;
                                position += 1;
                                positionToInsert += 1;
                            }

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute
                        myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, fackLevel);
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section

                // All threads process the M2M
                #pragma omp for nowait
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
                    myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), fackLevel);
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }

        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );


        //////////////////////////////////////////////////////////////////
        //Periodicity
        //////////////////////////////////////////////////////////////////

        octreeIterator = typename OctreeClass::Iterator(tree);

        if( idProcess == 0){
            int iterRequestsSize = 0;

            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    MPI_Irecv(&recvBufferSize[iterRequestsSize], 1, FMpi::GetType(recvBufferSize[iterRequestsSize]), idxProc,
                            FMpi::TagFmmM2MSize, comm.getComm(), &requests[iterRequestsSize]);
                    iterRequestsSize += 1;
                    FAssertLF(iterRequestsSize <= 7);
                }
            }
            MPI_Waitall( iterRequestsSize, requests, MPI_STATUSES_IGNORE);

            int iterRequests = 0;

            CellClass* currentChild[8];
            memcpy(currentChild, octreeIterator.getCurrentBox(), 8 * sizeof(CellClass*));

            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[iterRequests].cleanAndResize(recvBufferSize[iterRequests]);
                    FAssertLF(recvBufferSize[iterRequests] < std::numeric_limits<int>::max());
                    MPI_Irecv(recvBuffer[iterRequests].data(), int(recvBufferSize[iterRequests]), MPI_BYTE, idxProc,
                            FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests]);
                    iterRequests += 1;
                    FAssertLF(iterRequests <= 7);
                }
            }

            MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE);

            // Retrieve the data and merge my children with the children from others
            int counterProc = 0;
            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[counterProc].seek(0);
                    int state = int(recvBuffer[counterProc].getValue<char>());

                    int position = 0;
                    while( state && position < 8){
                        while(!(state & 0x1)){
                            state >>= 1;
                            ++position;
                        }
                        FAssertLF(!currentChild[position], "Already has a cell here");

                        recvBufferCells[position].deserializeUp(recvBuffer[counterProc]);

                        currentChild[position] = (CellClass*) &recvBufferCells[position];

                        state >>= 1;
                        ++position;
                    }

                    FAssertLF(recvBuffer[counterProc].tell() == recvBufferSize[counterProc]);
                    counterProc += 1;
                }
            }

            (*kernels[0]).M2M( &rootCellFromProc , currentChild, offsetRealTree);

            processPeriodicLevels();
        }
        else {
            if( hasWorkAtLevel(1) ){
                const int firstChild = getWorkingInterval(1, idProcess).leftIndex & 7;
                const int lastChild  = getWorkingInterval(1, idProcess).rightIndex & 7;

                CellClass** child = octreeIterator.getCurrentBox();

                char state = 0;
                sendBuffer.write(state);

                for(int idxChild = firstChild ; idxChild <= lastChild ; ++idxChild){
                    if( child[idxChild] ){
                        child[idxChild]->serializeUp(sendBuffer);
                        state = char( state | (0x1 << idxChild));
                    }
                }
                sendBuffer.writeAt(0,state);
                FSize sizeToSend = sendBuffer.getSize();
                MPI_Send(&sizeToSend, 1, FMpi::GetType(sizeToSend), 0, FMpi::TagFmmM2MSize, comm.getComm());
                MPI_Send(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, FMpi::TagFmmM2M, comm.getComm());
            }
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////


    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First, know what to send to whom
        //////////////////////////////////////////////////////////////////

        // pointer to send
        std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]);
        // Size in bytes to send to each proc at each level
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from others
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // All processes tell each other
        // what they will send to whom
        long long int*const globalReceiveMap = new long long  int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long  int) * nbProcess * nbProcess * OctreeHeight);
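        // Layout after the all-gather below: globalReceiveMap[sender * nbProcess * OctreeHeight
        // + level * nbProcess + receiver] holds the number of bytes `sender` sends to `receiver` at that level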

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);

        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has been already sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    ////octreeIterator.moveDown();
                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each level
                    for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){

                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cell
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Which cells potentially need data from others and, at the same time,
                        // are potentially needed by others
                        int neighborsPosition[/*189+26+1*/216];
                        MortonIndex neighborsIndexes[/*189+26+1*/216];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = getPeriodicInteractionNeighbors(iterArrayLocal[idxCell].getCurrentGlobalCoordinate(),
                                                                               idxLevel,
                                                                               neighborsIndexes, neighborsPosition, AllDirs, leafLevelSeparationCriteria);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // To send in an asynchronous way
                MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight];
                MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight];
                int iterRequest = 0;

                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));

                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));

                            FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait to receive everything (and send everything)
                FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__);

                delete[] requests;
                delete[] status;

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////

            #pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                ////octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each levels
                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    const int fackLevel = idxLevel + offsetRealTree;

                    const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);

                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
                        const int chunckSize = FMath::Max(1, numberOfCells/(omp_get_num_threads()*omp_get_num_threads()));
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
                            #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[342];
                                int neighborPositions[342];

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
                                    const int counter = tree->getPeriodicInteractionNeighbors(neighbors, neighborPositions,
                                                                            iterArray[idxCellToCompute].getCurrentGlobalCoordinate(),
                                                                            idxLevel, AllDirs, separationCriteria);

                                    if(counter)
                                        myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, neighborPositions, counter, fackLevel);
                                }
                            }
                        }
                    }//End of task spawning

                    #pragma omp taskwait

                    for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
                        #pragma omp task  default(none) firstprivate(idxThread,idxLevel)
                        {
                            kernels[idxThread]->finishedLevelM2L(fackLevel);
                        }
                    }
                    #pragma omp taskwait

                    FLOG(computationCounter.tac());
                }
            }
        }


        {
            FLOG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            ////octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // Compute a second time, now using the received data
            // for each level
            for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                const int fackLevel = idxLevel + offsetRealTree;

                const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);

                if(!procHasWorkAtLevel(idxLevel, idProcess)){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;
                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree<CellClass> tempTree;
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(recvBuffer[idxLevel * nbProcess + idxProc]){
                        const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();

                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
                            const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
                            FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);

                            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();

                            CellClass* const newCell = new CellClass;
                            newCell->setMortonIndex(cellIndex);
                            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);

                            tempTree.insertCell(cellIndex, idxLevel, newCell);
                        }

                        FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
                                recvBuffer[idxLevel * nbProcess + idxProc]->tell());
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                    octreeIterator.moveRight();
                }
                // for each cell
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = nullptr;

                // Compute these cells
                FLOG(computationCounter.tic());
                #pragma omp parallel num_threads(MaxThreads)
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                    MortonIndex neighborsIndex[/*189+26+1*/216];
                    int neighborsPosition[/*189+26+1*/216];
1046 1047
                    const CellClass* neighbors[342];
                    int neighborPositions[342];
1048

1049
                    #pragma omp for schedule(static) nowait
1050
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
1051
                        const int counterNeighbors = getPeriodicInteractionNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel, neighborsIndex, neighborsPosition, AllDirs, separationCriteria);
1052 1053 1054 1055 1056 1057 1058 1059 1060 1061
                        int counter = 0;
                        // do we receive this index from someone?
                        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                            if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
                                    || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){

                                CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);

                                if(otherCell){
                                    //otherCell->setMortonIndex(neighborsIndex[idxNeig]);
                                    neighbors[counter] = otherCell;
                                    neighborPositions[counter] = neighborsPosition[idxNeig] ;
                                    ++counter;
                                }
                            }
                        }
                        // need to compute
                        if(counter){
                            myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, neighborPositions, counter, fackLevel);
                        }
                    }

                    myThreadkernels->finishedLevelM2L(fackLevel);
                }
                FLOG(computationCounter.tac());
            }
            FLOG(receiveCounter.tac());
        }

        for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
            delete sendBuffer[idxComm];
            delete recvBuffer[idxComm];
        }
        for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
            delete leafsNeedOther[idxComm];
        }
        delete[] sendBuffer;
        delete[] recvBuffer;
        delete[] indexToSend;
        delete[] leafsNeedOther;
        delete[] globalReceiveMap;


        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );

    }

    //////////////////////////////////////////////////////////////////
    // ---------------- L2L ---------------
    //////////////////////////////////////////////////////////////////

    void downardPass(){ // second L2L
        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic waitCounter);

        // Start from the leaf level - 1
        typename OctreeClass::Iterator octreeIterator(tree);
        ////octreeIterator.moveDown();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // At most 1 receive and 7 sends (but 7 times the same data)
        MPI_Request*const requests = new MPI_Request[8];
        MPI_Status*const status = new MPI_Status[8];
        MPI_Request*const requestsSize = new MPI_Request[8];
        MPI_Status*const statusSize = new MPI_Status[8];

        FMpiBufferWriter sendBuffer(comm.getComm());
        FMpiBufferReader recvBuffer(comm.getComm());
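        // Each level exchanges at most one border cell per neighbor, in two phases: the serialized
        // size is sent first (TagFmmL2LSize) so the receiver can size its buffer, then the packed
        // cell itself follows (TagFmmL2L).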

        int righestProcToSendTo   = nbProcess - 1;

        // Periodic
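        // The local expansion of the periodic root is only up to date on process 0, so it is
        // broadcast in two steps (serialized size, then the raw bytes); every process then applies
        // the L2L from this root to the top cells of the real tree before the level-by-level descent.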
        if( idProcess == 0){
            rootCellFromProc.serializeDown(sendBuffer);
            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
            int sizeOfSerialization = int(sendBuffer.getSize());
            FMpi::MpiAssert( MPI_Bcast( &sizeOfSerialization, 1, MPI_INT, 0, comm.getComm() ), __LINE__ );
            FMpi::MpiAssert( MPI_Bcast( sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, comm.getComm() ), __LINE__ );
            sendBuffer.reset();
        }
        else{
            int sizeOfSerialization = -1;
            FMpi::MpiAssert( MPI_Bcast( &sizeOfSerialization, 1, MPI_INT, 0, comm.getComm() ), __LINE__ );
            recvBuffer.cleanAndResize(sizeOfSerialization);
            FMpi::MpiAssert( MPI_Bcast( recvBuffer.data(), sizeOfSerialization, MPI_BYTE, 0, comm.getComm() ), __LINE__ );
            rootCellFromProc.deserializeDown(recvBuffer);
            FAssertLF(recvBuffer.getSize() == sizeOfSerialization);
            recvBuffer.seek(0);
        }
        kernels[0]->L2L(&rootCellFromProc, octreeIterator.getCurrentBox(), offsetRealTree);


        // for each level except the leaf level
        for(int idxLevel = 1 ; idxLevel < OctreeHeight - 1 ; ++idxLevel ){
            const int fackLevel = idxLevel + offsetRealTree;
            // If nothing to do in the next level skip the current one
            if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;
                continue;
            }

            // Copy all the cells into an array, even the ones that are out of my working interval
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveDown();
            octreeIterator = avoidGotoLeftIterator;

            // Count the number of cells that are out of my working interval
            int nbCellsToSkip = 0;
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
                nbCellsToSkip += 1;
            }

            // Check if someone will send a cell to me
            bool hasToReceive = false;
            int idxProcToReceive = idProcess - 1;
            if(idProcess != 0 && nbCellsToSkip){
                // Starting from my left neighbor stop at the first proc that has work to do (not null interval)
                while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
                    idxProcToReceive -= 1;
                }
                // Check that we found such a proc and that it shares a cell with us on the border
                if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
                    hasToReceive = true;
                }
            }
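            // Border exchange: our first nbCellsToSkip cells at this level belong to a process on
            // our left, but their children may fall inside our interval. The owner of that shared
            // border cell sends us its local expansion so we can apply the L2L to the children we
            // hold; symmetrically, our last cell may have to be forwarded to processes on our right.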

            #pragma omp parallel num_threads(MaxThreads)
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                #pragma omp single nowait
                {
                    FLOG(prepareCounter.tic());
                    int iterRequests = 0;
                    int iterRequestsSize = 0;
                    FSize recvBufferSize = 0;
                    FSize sendBufferSize;
                    // Post the receive
                    if(hasToReceive){
                        FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive,
                                                    FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
                    }

                    // We have to be sure that we are not sending if we have no work in the current level
                    if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
                        int idxProcSend = idProcess + 1;
                        int nbMessageSent = 0;
                        // From the proc on the right to righestProcToSendTo, check if we have to send something
                        while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
                            // We know that if the proc has work at the next level it shares a cell with us due to the while condition
                            if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
                                FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend));
                                // If first message then serialize the cell to send
                                if( nbMessageSent == 0 ){
                                    // We send our last cell
                                    iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
                                    sendBufferSize = sendBuffer.getSize();
                                }
                                // Post the send message
                                FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend,
                                                           FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
                                FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, idxProcSend,
                                                           FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
                                // Inc and check the counter
                                nbMessageSent += 1;
                                FAssertLF(nbMessageSent <= 7);
                            }
                            idxProcSend += 1;
                        }
                        // Next time we will not need to go further than idxProcSend
                        righestProcToSendTo = idxProcSend;
                    }

                    // Finalize the communication
                    if(iterRequestsSize){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequestsSize <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, statusSize), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    if(hasToReceive){
                        recvBuffer.cleanAndResize(recvBufferSize);
                        FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max());
                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_PACKED, idxProcToReceive,
                                                    FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
                    }

                    if(iterRequests){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequests <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, status), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    // If we received something, proceed with the L2L
                    if(hasToReceive){
                        FAssertLF(iterRequests != 0);
                        // In this case we know that we have to perform the L2L with the last cell
                        // excluded from our working interval, i.e. iterArray[nbCellsToSkip-1]
                        iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
                        myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), fackLevel);
                    }
                    FLOG(prepareCounter.tac());
                }
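                // While the single thread above handles the MPI exchange and the shared border cell,
                // the other threads already run the L2L on the cells of our own interval below.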

                #pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                }
                // Threads work on all the cells of our working interval at this level
                #pragma omp for nowait
                for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){
                    myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), fackLevel);
                }
            }
            FLOG(computationCounter.tac());

            sendBuffer.reset();
            recvBuffer.seek(0);
        }

        delete[] requests;
        delete[] status;

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
    }


    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////
    struct LeafData{
        FTreeCoordinate coord;
        CellClass* cell;
        ContainerClass* targets;
        ContainerClass* sources;
    };


    /** P2P */
    void directPass(const bool p2pEnabled, const bool l2pEnabled){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG( FTic counterTime);
        FLOG( FTic prepareCounter);
        FLOG( FTic gatherCounter);
        FLOG( FTic waitCounter);
        FLOG(FTic computationCounter);
        FLOG(FTic computation2Counter);

        ///////////////////////////////////////////////////
        // Prepare the data to send and receive
        ///////////////////////////////////////////////////
        FLOG(prepareCounter.tic());

        // To send in an asynchronous way
        MPI_Request requests[2 * nbProcess];
        MPI_Status status[2 * nbProcess];
        int iterRequest = 0;
        int nbMessagesToRecv = 0;

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);

        /* This is a nbProcess x nbProcess matrix of integers.
         * Let U and V be ids of processes:
         * globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
         */
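        // For example, globalReceiveMap[1 * nbProcess + 3] is the amount of P2P data owned by
        // process 1 that process 3 needs to receive.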
        FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
        memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);

        FBoolArray leafsNeedOther(this->numberOfLeafs);
        int countNeedOther = 0;

        // To store the result
        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );

        // init
        const int LeafIndex = OctreeHeight - 1;
        const int SizeShape = P2PExclusionClass::SizeShape;
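        // P2PExclusionClass groups the leaves into SizeShape "colors" so that two leaves of the
        // same color are never P2P neighbors; each color can then be processed in parallel without
        // mutual exclusion.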

        int shapeLeaf[SizeShape];
        memset(shapeLeaf,0,SizeShape*sizeof(int));

        LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs];
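        // Every leaf is flattened into leafsDataArray (coordinate, cell, targets, sources) so the
        // parallel P2P loops can address leaves by index instead of re-walking the octree.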

        FVector<LeafData> leafsNeedOtherData(countNeedOther);

        FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
        FSize partsToSend[nbProcess];
        memset(partsToSend, 0, sizeof(FSize) * nbProcess);

#pragma omp parallel num_threads(MaxThreads)
        {
#pragma omp master // MUST WAIT to fill leafsNeedOther
            if(p2pEnabled){
                // Copy leafs
                {
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    int idxLeaf = 0;
                    do{
                        this->iterArray[idxLeaf++] = octreeIterator;
                    } while(octreeIterator.moveRight());
                }
                const int limite = 1 << (this->OctreeHeight - 1);
                int alreadySent[nbProcess];

                //Will store the indexes of the neighbors of current cell
                MortonIndex indexesNeighbors[26];
                int uselessIndexArray[26];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    memset(alreadySent, 0, sizeof(int) * nbProcess);
                    bool needOther = false;