// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROCPERIODIC_HPP
#define FFMMALGORITHMTHREADPROCPERIODIC_HPP


#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"

#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"
#include "../Utils/FMemUtils.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"

#include "../Containers/FBufferWriter.hpp"
#include "../Containers/FBufferReader.hpp"
#include "../Containers/FMpiBufferWriter.hpp"
#include "../Containers/FMpiBufferReader.hpp"

#include "../Utils/FMpi.hpp"

#include <omp.h>

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"

#include <memory>

/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 * @class FFmmAlgorithmThreadProcPeriodic
 * @brief Threaded FMM algorithm using MPI, with periodic boundary conditions.
 * Please read the license.
 *
 * This class is a threaded FMM algorithm distributed with MPI.
 * It iterates on the tree and calls the kernels with the proper arguments.
 * It uses the inspector-executor model:
 * it iterates on the tree and builds an array, then works in parallel on that array.
 *
 * This class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model
 * schedule(runtime) export OMP_NUM_THREADS=2
 * export OMPI_CXX=`which g++-4.4`
 * mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 * --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
 * ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
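//
// A minimal usage sketch (hedged: the octree construction and the kernel
// constructor arguments below are illustrative assumptions, and execute() is
// assumed to be the usual FAbstractAlgorithm entry point):
//
//   FMpi app(argc, argv);
//   OctreeClass tree(height, subHeight, boxWidth, boxCenter);
//   // ... insert particles into the tree ...
//   FFmmAlgorithmThreadProcPeriodic<FReal, OctreeClass, CellClass,
//           ContainerClass, KernelClass, LeafClass> algo(app.global(), &tree, 2);
//   // The kernel must be built with the extended geometry (see
//   // extendedTreeHeight()/extendedBoxWidth()/extendedBoxCenter() below):
//   KernelClass kernel(algo.extendedTreeHeight(), algo.extendedBoxWidth(),
//                      algo.extendedBoxCenter());
//   algo.setKernel(&kernel);
//   algo.execute();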
template<class FReal, class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProcPeriodic : public FAbstractAlgorithm {
    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels

    const FMpi::FComm& comm;                 //< MPI comm

    CellClass rootCellFromProc;     //< Root cell of the tree, needed by the periodicity
    const int nbLevelsAboveRoot;    //< The number of levels the user asks to go above the tree (>= -1)
    const int offsetRealTree;       //< Level offset applied to the real tree (nbLevelsAboveRoot + 2)

    typename OctreeClass::Iterator* iterArray;      //< Used to store iterators on the cells at a level
    typename OctreeClass::Iterator* iterArrayComm;  //< Used to store iterators on the cells/leaves to send/recv
    int numberOfLeafs;                              //< The number of leaves on this process

    const int MaxThreads;               //< The maximum number of threads allowed by OpenMP

    const int nbProcess;                //< Number of process
    const int idProcess;                //< Id of current process

    const int OctreeHeight;

    const int leafLevelSeparationCriteria;

public:
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };
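    // An Interval holds the inclusive Morton index range [leftIndex, rightIndex]
    // owned by a proc at a given level; an empty interval is encoded with
    // leftIndex > rightIndex (see procHasWorkAtLevel below).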

    Interval& getWorkingInterval(const int level, const int proc)const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }
private:
    Interval*const intervals;
    Interval*const workingIntervalsPerLevel;

    /** To know if a proc has work at a given level (i.e. it holds cells and is responsible for them) */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** Return true if idxProc's left cell at idxLevel+1 has the same parent as our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** Return true if idxProc's right cell at idxLevel+1 has the same parent as our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }

public:

    void setKernel(KernelClass*const inKernels){
        this->kernels = new KernelClass*[MaxThreads];
        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp critical (InitFFmmAlgorithmThreadProcPeriodic)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }
    }
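    // Design note: each OpenMP thread works on its own copy of the kernel, so a
    // kernel may keep mutable per-thread state without locking; the copies are
    // made inside a critical section because the KernelClass copy constructor is
    // not assumed to be thread-safe.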


    Interval& getWorkingInterval(const int level){
        return getWorkingInterval(level, idProcess);
    }

    bool hasWorkAtLevel(const int level){
        return idProcess == 0 || getWorkingInterval(level, idProcess - 1).rightIndex < getWorkingInterval(level, idProcess).rightIndex;
    }

    /** The constructor needs the MPI communicator and the octree used for the computation.
     * @param inComm the MPI communicator
     * @param inTree the octree to work on
     * @param inUpperLevel the number of levels to add above the root (>= -1)
     * @param inLeafLevelSeparationCriteria the separation criteria to use at the leaf level
     * An assertion is raised if the tree is null.
     */
    FFmmAlgorithmThreadProcPeriodic(const FMpi::FComm& inComm, OctreeClass* const inTree,
                                    const int inUpperLevel = 2, const int inLeafLevelSeparationCriteria = 1)
        : tree(inTree) , kernels(nullptr), comm(inComm), nbLevelsAboveRoot(inUpperLevel), offsetRealTree(inUpperLevel + 2),
          numberOfLeafs(0),
          MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
          OctreeHeight(tree->getHeight()),
          leafLevelSeparationCriteria(inLeafLevelSeparationCriteria),
          intervals(new Interval[inComm.processCount()]),
          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {

        FAssertLF(tree, "tree cannot be null");
        FAssertLF(-1 <= inUpperLevel, "inUpperLevel cannot be < -1");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");

        FAbstractAlgorithm::setNbLevelsInTree(extendedTreeHeight());

        FLOG(FLog::Controller << "FFmmAlgorithmThreadProcPeriodic\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }

    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProcPeriodic(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }




    long long int theoricalRepetition() const {
        if( nbLevelsAboveRoot == -1 ){
            // we know it is 3 (-1;+1)
            return 3;
        }
        // Otherwise find the repetition in one direction and double it
        return 6 * (1<<(nbLevelsAboveRoot));
    }
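    // For example, nbLevelsAboveRoot == 0 gives 6 repetitions per direction,
    // and nbLevelsAboveRoot == 2 gives 6 * (1 << 2) == 24.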


    void repetitionsIntervals(FTreeCoordinate*const min, FTreeCoordinate*const max) const {
        if( nbLevelsAboveRoot == -1 ){
            // We know it is (-1;1)
            min->setPosition(-1,-1,-1);
            max->setPosition(1,1,1);
        }
        else{
            const int halfRepeated = int(theoricalRepetition()/2);
            min->setPosition(-halfRepeated,-halfRepeated,-halfRepeated);
            // if we repeat the box 8 times, we go from [-4 to 3]
            max->setPosition(halfRepeated-1,halfRepeated-1,halfRepeated-1);
        }
    }


    FReal extendedBoxWidth() const {
        // The simulation box is repeated 4 times per direction if nbLevelsAboveRoot == -1,
        // and the extended width doubles for each level above that
        return tree->getBoxWidth() * FReal(1<<(nbLevelsAboveRoot+3));
    }
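    // For example, nbLevelsAboveRoot == -1 gives boxWidth * (1 << 2), i.e. the
    // box repeated 4 times per direction; each level above doubles this width.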

    /** This function must be used to init the kernel with the correct arguments.
     * It returns the box center seen from the kernel's point of view for the
     * periodicity the user asked for.
     * It is computed from the tree's original box width and box center.
     * @return the center the kernel should use
     */
    FPoint<FReal> extendedBoxCenter() const {
        const FReal originalBoxWidth     = tree->getBoxWidth();
        const FReal originalBoxWidthDiv2 = originalBoxWidth/2.0;
        const FPoint<FReal> originalBoxCenter   = tree->getBoxCenter();

        const FReal offset = extendedBoxWidth()/FReal(2.0);
        return FPoint<FReal>( originalBoxCenter.getX() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getY() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getZ() - originalBoxWidthDiv2 + offset);
    }

    /** This function must be used to init the kernel with the correct arguments.
     * It returns the tree height seen from the kernel's point of view for the
     * periodicity the user asked for.
     * It is computed from the real tree height.
     * @return the height the kernel should use
     */
    int extendedTreeHeight() const {
        // The real height plus the levels added for the periodicity
        return OctreeHeight + offsetRealTree;
    }
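    // For example, with the default inUpperLevel == 2 the offset is 4, so the
    // kernel sees a tree of height OctreeHeight + 4.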


protected:
    /**
     * To execute the fmm algorithm
     * Call this function to run the complete algorithm
     */
    void executeCore(const unsigned operationsToProceed) override {
        // Count leaf
        this->numberOfLeafs = 0;
        {
            Interval myFullInterval;
            {//Building the interval with the first and last leaves (and count the number of leaves)
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
            }
            // Allocate arrays to store the iterators on the cells at a level
            iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
            iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
            FAssertLF(iterArray,     "iterArray     bad alloc");
            FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

            // Gather the leftIndex/rightIndex bounds from every proc
            FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()),  __LINE__ );

            // Build my intervals for all levels
            std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
            // At leaf level we know it is the full interval
            myIntervals[OctreeHeight - 1] = myFullInterval;

            // We can estimate the interval for each level by using the parent/child relation
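            // (A Morton index shifted right by 3 bits, one bit per dimension,
            // is the Morton index of its parent cell.)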
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
            }

            // Process 0 uses the estimates as real intervals, but other processes
            // should remove cells that belong to others
            if(idProcess != 0){
                // For each level, test whether the process on the left (idProcess-1) owns cells I thought I owned
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                // At h-1 the working limit is the parent of the right cell of the proc on the left
                MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                // We check if we have no more work to do
                int nullIntervalFromLevel = 0;

                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                        if( !octreeIterator.moveRight() ){
                            // We cannot move right: we do not own any more cells
                            nullIntervalFromLevel = idxLevel;
                            break;
                        }
                    }
                    // If we are responsible for some cells at this level keep the first index
                    if(nullIntervalFromLevel == 0){
                        myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                        octreeIterator.moveUp();
                        workingLimitAtLevel >>= 3;
                    }
                }
                // In case we are not responsible for any cells we put the leftIndex = rightIndex+1
                for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                }
            }

            // Gather the per-level leftIndex/rightIndex bounds from every proc
            FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()),  __LINE__ );
        }

        // Run the requested passes
        if(operationsToProceed & FFmmP2M) bottomPass();

        if(operationsToProceed & FFmmM2M) upwardPass();

        if(operationsToProceed & FFmmM2L) transferPass();

        if(operationsToProceed & FFmmL2L) downardPass();

        if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));


        // delete array
        delete []     iterArray;
        delete [] iterArrayComm;
        iterArray     = nullptr;
        iterArrayComm = nullptr;
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M.
     * It is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the iterators on the leaves into the array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FLOG(computationCounter.tic());
#pragma omp parallel
        {
            // Each thread gets its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from the leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to;
        // starting from the proc on the left, this variable will decrease to 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 7 receives
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        FSize bufferSize;
        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
        FSize recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that sends me a cell;
        // this variable will increase up to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // We work from height-1 to 1
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 0 ; --idxLevel ){
            const int fackLevel = idxLevel + offsetRealTree;
            // Are my cells covered by my neighbors' working interval, so that I have no more work?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for size send/recv requests

            int nbCellsToSkip     = 0; // The number of cells to skip
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }

            FLOG(parallelCounter.tic());
            #pragma omp parallel
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                //This single section posts and receives the comms, then does the M2M associated with them.
                #pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // The master proc never sends
                    if(idProcess != 0){
                        // Skip procs that have no work at this level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Check whether the next proc that has work shares the parent of my left cell
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);

                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the rightmost cell out of my working interval (at position
                            // nbCellsToSkip-1) must be considered; the others (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if the child exists and is part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
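                            // packageFlags now has bit i set for every child i
                            // that was serialized, in ascending order of i.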
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    // Post the receives; the data is needed in several parts of this section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Starting from firstProcThatSend, find the procs whose cells share the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]),
                                        idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, then do the work
                    // Are we sending or receiving anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Starting from firstProcThatSend, find the procs whose cells share the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED,
                                        idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, then do the work
                    // Are we sending or receiving anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We received something, so we need to perform the last M2M
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve data and merge my children with the children from others
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());

                            int position = 0;
                            int positionToInsert = 0;
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(positionToInsert < 7);
                                FAssertLF(position < 8);
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];

                                packageFlags >>= 1;
                                position += 1;
                                positionToInsert += 1;
                            }
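                            // For example, flags 0b00010100 restore two cells
                            // from the buffer into child positions 2 and 4, in
                            // that order.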

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute
                        myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, fackLevel);
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section

                // All threads process the M2M
                #pragma omp for nowait
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
                    myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), fackLevel);
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }

        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );


        //////////////////////////////////////////////////////////////////
        //Periodicity
        //////////////////////////////////////////////////////////////////

        octreeIterator = typename OctreeClass::Iterator(tree);

        if( idProcess == 0){
            int iterRequestsSize = 0;

            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    MPI_Irecv(&recvBufferSize[iterRequestsSize], 1, FMpi::GetType(recvBufferSize[iterRequestsSize]), idxProc,
                            FMpi::TagFmmM2MSize, comm.getComm(), &requests[iterRequestsSize]);
                    iterRequestsSize += 1;
                    FAssertLF(iterRequestsSize <= 7);
                }
            }
            MPI_Waitall( iterRequestsSize, requests, MPI_STATUSES_IGNORE);

            int iterRequests = 0;
            CellClass* currentChild[8];
            memcpy(currentChild, octreeIterator.getCurrentBox(), 8 * sizeof(CellClass*));

            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[iterRequests].cleanAndResize(recvBufferSize[iterRequests]);
                    FAssertLF(recvBufferSize[iterRequests] < std::numeric_limits<int>::max());
                    MPI_Irecv(recvBuffer[iterRequests].data(), int(recvBufferSize[iterRequests]), MPI_BYTE, idxProc,
                            FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests]);
                    iterRequests += 1;
                    FAssertLF(iterRequests <= 7);
                }
            }

            MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE);

            // Retrieve data and merge my children with the children from others
            int counterProc = 0;
            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[counterProc].seek(0);
                    int state = int(recvBuffer[counterProc].getValue<char>());

                    int position = 0;
                    while( state && position < 8){
                        while(!(state & 0x1)){
                            state >>= 1;
                            ++position;
                        }
                        FAssertLF(!currentChild[position], "Already has a cell here");

                        recvBufferCells[position].deserializeUp(recvBuffer[counterProc]);

                        currentChild[position] = (CellClass*) &recvBufferCells[position];

                        state >>= 1;
                        ++position;
                    }

                    FAssertLF(recvBuffer[counterProc].tell() == recvBufferSize[counterProc]);
                    counterProc += 1;
                }
            }

            (*kernels[0]).M2M( &rootCellFromProc , currentChild, offsetRealTree);

            processPeriodicLevels();
        }
        else {
            if( hasWorkAtLevel(1) ){
                const int firstChild = getWorkingInterval(1, idProcess).leftIndex & 7;
                const int lastChild  = getWorkingInterval(1, idProcess).rightIndex & 7;

                CellClass** child = octreeIterator.getCurrentBox();

                char state = 0;
                sendBuffer.write(state);

                for(int idxChild = firstChild ; idxChild <= lastChild ; ++idxChild){
                    if( child[idxChild] ){
                        child[idxChild]->serializeUp(sendBuffer);
                        state = char( state | (0x1 << idxChild));
                    }
                }
                sendBuffer.writeAt(0,state);
                FSize sizeToSend = sendBuffer.getSize();
                MPI_Send(&sizeToSend, 1, FMpi::GetType(sizeToSend), 0, FMpi::TagFmmM2MSize, comm.getComm());
                MPI_Send(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, FMpi::TagFmmM2M, comm.getComm());
            }
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward (M2L)
    /////////////////////////////////////////////////////////////////////////////


    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First, know what to send to whom
        //////////////////////////////////////////////////////////////////

        // pointers to the cells to send
        std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]);
        // index
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from other procs
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // Every proc tells every other proc
        // what it will send to whom
        long long int*const globalReceiveMap = new long long  int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long  int) * nbProcess * nbProcess * OctreeHeight);
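        // Layout: globalReceiveMap[src * nbProcess * OctreeHeight + level * nbProcess + dst]
        // is the number of bytes proc src will send to proc dst at that level
        // (filled by the MPI_Allgather below).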

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);

        #pragma omp parallel
        {
            #pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has been already sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    ////octreeIterator.moveDown();
                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each level
                    for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){

                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cell
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Find which cells potentially need data from others and are,
                        // at the same time, potentially needed by others
                        int neighborsPosition[/*189+26+1*/216];
                        MortonIndex neighborsIndexes[/*189+26+1*/216];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = getPeriodicInteractionNeighbors(iterArrayLocal[idxCell].getCurrentGlobalCoordinate(),
                                                                               idxLevel,
                                                                               neighborsIndexes, neighborsPosition, AllDirs, leafLevelSeparationCriteria);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
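                                        // Byte layout per (level, proc) message:
                                        // an int cell count, then for each cell an
                                        // FSize offset, its MortonIndex, and the
                                        // serialized multipole data (written in the
                                        // send loop below).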
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // Sends are posted asynchronously
                MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight];
                MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight];
                int iterRequest = 0;

                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));

                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));

                            FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait until everything has been received (and sent)
                FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__);

                delete[] requests;
                delete[] status;

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////

            #pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                ////octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each level
                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    const int fackLevel = idxLevel + offsetRealTree;

                    const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);

                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
                        const int chunckSize = FMath::Max(1, numberOfCells/(omp_get_num_threads()*omp_get_num_threads()));
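                        // Chunking heuristic: create roughly numThreads^2 tasks per
                        // level, so the load balances across threads while keeping
                        // the task-scheduling overhead low.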
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
                            #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[343];

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
                                    const int counter = tree->getPeriodicInteractionNeighbors(neighbors,
                                                                            iterArray[idxCellToCompute].getCurrentGlobalCoordinate(),
                                                                            idxLevel, AllDirs, separationCriteria);

                                    if(counter)
                                        myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, counter, fackLevel);
                                }
                            }
                        }
                    }//End of task spawning

                    #pragma omp taskwait

                    for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
                        #pragma omp task  default(none) firstprivate(idxThread,idxLevel)
                        {
                            kernels[idxThread]->finishedLevelM2L(fackLevel);
                        }
                    }
                    #pragma omp taskwait

                    FLOG(computationCounter.tac());
                }
            }
        }


        {
            FLOG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            ////octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // Compute a second time with the received data
            // for each level
            for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                const int fackLevel = idxLevel + offsetRealTree;

                const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);

                if(!procHasWorkAtLevel(idxLevel, idProcess)){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;
                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree<CellClass> tempTree;
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(recvBuffer[idxLevel * nbProcess + idxProc]){
                        const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();

                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
                            const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
                            FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);

                            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();

                            CellClass* const newCell = new CellClass;
                            newCell->setMortonIndex(cellIndex);
                            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);

                            tempTree.insertCell(cellIndex, idxLevel, newCell);
                        }

                        FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
                                recvBuffer[idxLevel * nbProcess + idxProc]->tell());
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                    octreeIterator.moveRight();
                }
                // for each cell
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = nullptr;

                // Compute these cells
                FLOG(computationCounter.tic());
                #pragma omp parallel
                {
                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                    MortonIndex neighborsIndex[/*189+26+1*/216];
                    int neighborsPosition[/*189+26+1*/216];
                    const CellClass* neighbors[343];

                    #pragma omp for schedule(static) nowait
                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                        // compute indexes
                        memset(neighbors, 0, 343 * sizeof(CellClass*));
                        const int counterNeighbors = getPeriodicInteractionNeighbors(iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel, neighborsIndex, neighborsPosition, AllDirs, separationCriteria);
                        //const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition);

                        int counter = 0;
                        // did we receive this index from someone?
                        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
                            if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
                                    || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){

                                CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);

                                if(otherCell){
                                    //otherCell->setMortonIndex(neighborsIndex[idxNeig]);
                                    neighbors[ neighborsPosition[idxNeig] ] = otherCell;
                                    ++counter;
                                }
                            }
                        }
                        // need to compute
                        if(counter){
                            myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, fackLevel);
                        }
                    }

                    myThreadkernels->finishedLevelM2L(fackLevel);
                }
                FLOG(computationCounter.tac());
            }
            FLOG(receiveCounter.tac());
        }

        for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
            delete sendBuffer[idxComm];
            delete recvBuffer[idxComm];
        }
        for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
            delete leafsNeedOther[idxComm];
        }
        delete[] sendBuffer;
        delete[] recvBuffer;
        delete[] indexToSend;
        delete[] leafsNeedOther;
        delete[] globalReceiveMap;


        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );

    }

    //////////////////////////////////////////////////////////////////
    // ---------------- L2L ---------------
    //////////////////////////////////////////////////////////////////

    void downardPass(){ // second L2L
        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic waitCounter);

        // Start from the level below the root (level 1)
        typename OctreeClass::Iterator octreeIterator(tree);
        ////octreeIterator.moveDown();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
        // At most 1 receive and 7 sends (but 7 times the same data)
        MPI_Request*const requests = new MPI_Request[8];
        MPI_Status*const status = new MPI_Status[8];
        MPI_Request*const requestsSize = new MPI_Request[8];
        MPI_Status*const statusSize = new MPI_Status[8];

        FMpiBufferWriter sendBuffer(comm.getComm());
        FMpiBufferReader recvBuffer(comm.getComm());

        int righestProcToSendTo   = nbProcess - 1;

        // Periodic
        if( idProcess == 0){
            rootCellFromProc.serializeDown(sendBuffer);
            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
            int sizeOfSerialization = int(sendBuffer.getSize());
            FMpi::MpiAssert( MPI_Bcast( &sizeOfSerialization, 1, MPI_INT, 0, comm.getComm() ), __LINE__ );
            FMpi::MpiAssert( MPI_Bcast( sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, comm.getComm() ), __LINE__ );
            sendBuffer.reset();
        }
        else{
            int sizeOfSerialization = -1;
            FMpi::MpiAssert( MPI_Bcast( &sizeOfSerialization, 1, MPI_INT, 0, comm.getComm() ), __LINE__ );
            recvBuffer.cleanAndResize(sizeOfSerialization);
            FMpi::MpiAssert( MPI_Bcast( recvBuffer.data(), sizeOfSerialization, MPI_BYTE, 0, comm.getComm() ), __LINE__ );
            rootCellFromProc.deserializeDown(recvBuffer);
            FAssertLF(recvBuffer.getSize() == sizeOfSerialization);
            recvBuffer.seek(0);
        }
        kernels[0]->L2L(&rootCellFromProc, octreeIterator.getCurrentBox(), offsetRealTree);


        // for each level except the leaf level
        for(int idxLevel = 1 ; idxLevel < OctreeHeight - 1 ; ++idxLevel ){
            const int fackLevel = idxLevel + offsetRealTree;
            // If nothing to do in the next level skip the current one
            if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;
                continue;
            }

            // Copy all the cells into an array, even the ones that are out of my working interval
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveDown();
            octreeIterator = avoidGotoLeftIterator;

            // Count the number of cells that are out of my working interval
            int nbCellsToSkip = 0;
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
                nbCellsToSkip += 1;
            }

            // Check if someone will send a cell to me
            bool hasToReceive = false;
            int idxProcToReceive = idProcess - 1;
            if(idProcess != 0 && nbCellsToSkip){
                // Starting from my left neighbor, stop at the first proc that has work to do (non-null interval)
                while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
                    idxProcToReceive -= 1;
                }
                // Check if we found such a proc and that it shares a cell with us on the border
                if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
                    hasToReceive = true;
                }
            }
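            // The exchange below follows a simple pattern: the proc that owns the shared
            // border cell serializes it after its own L2L, while the procs whose work at
            // the next level hangs under a copy of that cell receive it and apply the
            // L2L on their side.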
            #pragma omp parallel
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                #pragma omp single nowait
                {
                    FLOG(prepareCounter.tic());
                    int iterRequests = 0;
                    int iterRequestsSize = 0;
                    FSize recvBufferSize = 0;
                    FSize sendBufferSize = 0;
                    // Post the receive
                    if(hasToReceive){
                        FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive,
                                                    FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
                    }

                    // We have to be sure that we are not sending if we have no work in the current level
                    if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
                        int idxProcSend = idProcess + 1;
                        int nbMessageSent = 0;
                        // From the proc on the right to righestProcToSendTo, check if we have to send something
                        while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
                            // We know that if the proc has work at the next level, it shares a cell with us, due to the while condition
                            if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
                                FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend));
                                // If first message then serialize the cell to send
                                if( nbMessageSent == 0 ){
                                    // We send our last cell
                                    iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
                                    sendBufferSize = sendBuffer.getSize();
                                }
                                // Post the send message
                                FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend,
                                                           FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
                                FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, idxProcSend,
                                                           FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
                                // Inc and check the counter
                                nbMessageSent += 1;
                                FAssertLF(nbMessageSent <= 7);
                            }
                            idxProcSend += 1;
                        }
                        // Next time we will not need to go further than idxProcSend
                        righestProcToSendTo = idxProcSend;
                    }

                    // Finalize the communication
                    if(iterRequestsSize){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequestsSize <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, statusSize), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    if(hasToReceive){
                        recvBuffer.cleanAndResize(recvBufferSize);
                        FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max());
                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_PACKED, idxProcToReceive,
                                                    FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
                    }

                    if(iterRequests){
                        FLOG(waitCounter.tic());
                        FAssertLF(iterRequests <= 8);
                        FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, status), __LINE__);
                        FLOG(waitCounter.tac());
                    }

                    // If we received something, perform the L2L
                    if(hasToReceive){
                        FAssertLF(iterRequests != 0);
                        // In this case we know that we have to perform the L2L with the last cell
                        // excluded from our working interval, at position nbCellsToSkip-1
                        iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
                        myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), fackLevel);
                    }
                    FLOG(prepareCounter.tac());
                }

                #pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                }
                // Threads work on all the cells of our working interval at this level
                #pragma omp for nowait
                for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){
                    myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), fackLevel);
                }
            }
            FLOG(computationCounter.tac());

            sendBuffer.reset();
            recvBuffer.seek(0);
        }

        delete[] requests;
        delete[] status;
        delete[] requestsSize;
        delete[] statusSize;

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
    }


    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////
    struct LeafData{
        FTreeCoordinate coord;
        CellClass* cell;
        ContainerClass* targets;
        ContainerClass* sources;
    };


    /** P2P */
    void directPass(const bool p2pEnabled, const bool l2pEnabled){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG( FTic counterTime);
        FLOG( FTic prepareCounter);
        FLOG( FTic gatherCounter);
        FLOG( FTic waitCounter);
        FLOG(FTic computationCounter);
        FLOG(FTic computation2Counter);

        ///////////////////////////////////////////////////
        // Prepare the data to send/receive
        ///////////////////////////////////////////////////
        FLOG(prepareCounter.tic());

        // To send in an asynchronous way
        MPI_Request requests[2 * nbProcess];
        MPI_Status status[2 * nbProcess];
        int iterRequest = 0;
        int nbMessagesToRecv = 0;

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);

        /* This is a nbProcess x nbProcess matrix of integers:
         * let U and V be process ids,
         * globalReceiveMap[U*nbProcess + V] == size of the information owned by U and needed by V
         */
        FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
        memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);
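        // For instance, with two processes, globalReceiveMap[0*nbProcess + 1] is the
        // number of bytes that process 0 owns and process 1 needs, i.e. exactly the
        // size of the buffer that process 0 will send to process 1.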

        FBoolArray leafsNeedOther(this->numberOfLeafs);
        int countNeedOther = 0;

        // A light octree to store the particles received from the other processes
        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );

        // init
        const int LeafIndex = OctreeHeight - 1;
        const int SizeShape = P2PExclusionClass::SizeShape;
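        // Leaves are sorted by "shape", a geometric coloring provided by
        // P2PExclusionClass: leaves of the same shape are far enough apart for
        // their P2P updates not to conflict, which is what allows the mutual
        // interactions to be computed in parallel tasks below.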

        int shapeLeaf[SizeShape];
        memset(shapeLeaf,0,SizeShape*sizeof(int));

        LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs];

        FVector<LeafData> leafsNeedOtherData(countNeedOther);

        FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
        FSize partsToSend[nbProcess];
        memset(partsToSend, 0, sizeof(FSize) * nbProcess);

#pragma omp parallel
        {
#pragma omp master // MUST WAIT to fill leafsNeedOther
            if(p2pEnabled){
                // Copy leafs
                {
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    int idxLeaf = 0;
                    do{
                        this->iterArray[idxLeaf++] = octreeIterator;
                    } while(octreeIterator.moveRight());
                }
                const int limite = 1 << (this->OctreeHeight - 1);
                int alreadySent[nbProcess];

                // Will store the indexes of the neighbors of the current cell
                MortonIndex indexesNeighbors[26];
                int uselessIndexArray[26];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    memset(alreadySent, 0, sizeof(int) * nbProcess);
                    bool needOther = false;
                    // Get the neighbors of the current cell in indexesNeighbors, and their number in neighCount
                    const int neighCount = getNeighborsIndexesPeriodic(iterArray[idxLeaf].getCurrentGlobalCoordinate(),limite,
                                                                       indexesNeighbors, uselessIndexArray, AllDirs);
                    //Loop over the neighbor leafs
                    for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
                        //Test if leaf belongs to someone else (false if it's mine)
                        if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
                            needOther = true;

                            // find the proc that will need current leaf
                            int procToReceive = idProcess;
                            while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].leftIndex){
                                --procToReceive; //scroll process "before" current process
                            }
                            while( procToReceive != nbProcess - 1 && (intervals[procToReceive].rightIndex) < indexesNeighbors[idxNeigh]){
                                ++procToReceive;//scroll process "after" current process
                            }
                            // Test: not already sent && make sure someone actually holds this interval
                            if( !alreadySent[procToReceive] && intervals[procToReceive].leftIndex <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].rightIndex){

                                alreadySent[procToReceive] = 1;
                                toSend[procToReceive].push( iterArray[idxLeaf] );
                                partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize();
                                partsToSend[procToReceive] += int(sizeof(MortonIndex));
                            }
                        }
                    }

                    if(needOther){ // means that something needs to be sent (or received)
                        leafsNeedOther.set(idxLeaf,true);
                        ++countNeedOther;
                    }
                }

                // Account for the leaf counter (an FSize) written at the head of each non-empty buffer (see the send loop below)
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(partsToSend[idxProc]){
                        partsToSend[idxProc] += int(sizeof(FSize));
                    }
                }
            }

#pragma omp barrier

#pragma omp master // nowait
            if(p2pEnabled){
                // Share globalReceiveMap with all processes
                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend),
                                                globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                //Prepare receive
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ // if idxProc has something for me
                        //allocate buffer of right size
                        recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]);
                        FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max());
                        FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED,
                                                   idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                    }
                }

                nbMessagesToRecv = iterRequest;
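                // The receives were posted before the sends, so after the MPI_Waitall
                // below the first nbMessagesToRecv entries of status describe the
                // receives, and status[idxRcv].MPI_SOURCE identifies each sender.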
                // Prepare send
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(toSend[idxProc].getSize() != 0){
                        sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]);
                        // << is equivalent to write().
                        (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
                        for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
                            (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
                            toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
                        }

                        FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]);
                        FAssertLF(sendBuffer[idxProc]->getSize() < std::numeric_limits<int>::max());
                        FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), int(sendBuffer[idxProc]->getSize()) , MPI_PACKED ,
                                                    idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );

                    }
                }

                delete[] toSend;


                //////////////////////////////////////////////////////////
                // Wait for the sends/receives
                //////////////////////////////////////////////////////////

                // Wait for the data
                FLOG(waitCounter.tic());
                MPI_Waitall(iterRequest, requests, status);
                FLOG(waitCounter.tac());

                for(int idxRcv = 0 ; idxRcv < nbMessagesToRecv ; ++idxRcv){
                    const int idxProc = status[idxRcv].MPI_SOURCE;
                    FSize nbLeaves;
                    (*recvBuffer[idxProc]) >> nbLeaves;
                    for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
                        MortonIndex leafIndex;
                        (*recvBuffer[idxProc]) >> leafIndex;
                        otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc]));
                    }
                    delete recvBuffer[idxProc];
                    recvBuffer[idxProc] = nullptr;
                }
            }

            ///////////////////////////////////////////////////
            // Prepare data for thread P2P
            ///////////////////////////////////////////////////

#pragma omp single // MUST WAIT!
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();

                // to store the shape of each leaf
                typename OctreeClass::Iterator* const myLeafs = new typename OctreeClass::Iterator[this->numberOfLeafs];
                int*const shapeType = new int[this->numberOfLeafs];

                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
                    myLeafs[idxLeaf] = octreeIterator;

                    const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate();
                    const int shape = P2PExclusionClass::GetShapeIdx(coord);
                    shapeType[idxLeaf] = shape;

                    ++shapeLeaf[shape];

                    octreeIterator.moveRight();
                }

                int startPosAtShape[SizeShape];
                startPosAtShape[0] = 0;
                for(int idxShape = 1 ; idxShape < SizeShape ; ++idxShape){
                    startPosAtShape[idxShape] = startPosAtShape[idxShape-1] + shapeLeaf[idxShape-1];
                }
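                // startPosAtShape is the exclusive prefix sum of shapeLeaf: the loop
                // below is a counting sort that stores the leaves shape by shape
                // (all shape-0 leaves first, then shape-1, etc.) in leafsDataArray.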

                int idxInArray = 0;
                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf, ++idxInArray){
                    const int shapePosition = shapeType[idxInArray];

                    leafsDataArray[startPosAtShape[shapePosition]].coord = myLeafs[idxInArray].getCurrentGlobalCoordinate();
                    leafsDataArray[startPosAtShape[shapePosition]].cell = myLeafs[idxInArray].getCurrentCell();
                    leafsDataArray[startPosAtShape[shapePosition]].targets = myLeafs[idxInArray].getCurrentListTargets();
                    leafsDataArray[startPosAtShape[shapePosition]].sources = myLeafs[idxInArray].getCurrentListSrc();
                    if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData.push(leafsDataArray[startPosAtShape[shapePosition]]);

                    ++startPosAtShape[shapePosition];
                }

                delete[] shapeType;
                delete[] myLeafs;

                FLOG(prepareCounter.tac());
            }


            //////////////////////////////////////////////////////////
            // P2P computation that does NOT need other processes' data
            //////////////////////////////////////////////////////////

            {
#pragma omp single nowait
                {
                    FLOG(computationCounter.tic());
                    int previous = 0;
                    const FReal boxWidth = tree->getBoxWidth();

                    for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                        const int endAtThisShape = shapeLeaf[idxShape] + previous;
                        const int chunckSize = FMath::Max(1, (endAtThisShape-previous)/(omp_get_num_threads()*omp_get_num_threads()));
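                        // Roughly nbThreads^2 tasks per shape: enough chunks to balance
                        // the load between threads without creating tiny tasks.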

                        for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; idxLeafs += chunckSize){
                            const int nbLeavesInTask = FMath::Min(endAtThisShape-idxLeafs, chunckSize);
#pragma omp task default(none) firstprivate(nbLeavesInTask,idxLeafs) //+ shared(leafsDataArray)
                            {
                                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);

                                // There is a maximum of 26 neighbors (27 slots, the center one unused)
                                ContainerClass* neighbors[27];
                                FTreeCoordinate offsets[27];
                                bool hasPeriodicLeaves;
                                for(int idxTaskLeaf = idxLeafs ; idxTaskLeaf < (idxLeafs + nbLeavesInTask) ; ++idxTaskLeaf){
                                    LeafData& currentIter = leafsDataArray[idxTaskLeaf];
                                    if(l2pEnabled){
                                        myThreadkernels->L2P(currentIter.cell, currentIter.targets);
                                    }
                                    if(p2pEnabled){
                                        // need the current particles and the neighbors' particles
                                        const int counter = tree->getPeriodicLeafsNeighbors(neighbors, offsets, &hasPeriodicLeaves,
                                                                                            currentIter.coord, LeafIndex, AllDirs);
                                        int periodicNeighborsCounter = 0;

                                        if(hasPeriodicLeaves){
                                            ContainerClass* periodicNeighbors[27];
                                            memset(periodicNeighbors, 0, 27 * sizeof(ContainerClass*));
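                                            // Periodic neighbors are temporarily translated by a
                                            // multiple of the box width so that the kernel sees
                                            // the correct relative positions; the translation is
                                            // undone right after the P2PRemote call below.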

                                            for(int idxNeig = 0 ; idxNeig < 27 ; ++idxNeig){
                                                if( neighbors[idxNeig] && !offsets[idxNeig].equals(0,0,0) ){
                                                    // Put periodic neighbors into other array
                                                    periodicNeighbors[idxNeig] = neighbors[idxNeig];
                                                    neighbors[idxNeig] = nullptr;
                                                    ++periodicNeighborsCounter;

                                                    FReal*const positionsX = periodicNeighbors[idxNeig]->getWPositions()[0];
                                                    FReal*const positionsY = periodicNeighbors[idxNeig]->getWPositions()[1];
                                                    FReal*const positionsZ = periodicNeighbors[idxNeig]->getWPositions()[2];

                                                    for(FSize idxPart = 0; idxPart < periodicNeighbors[idxNeig]->getNbParticles() ; ++idxPart){
                                                        positionsX[idxPart] += boxWidth * FReal(offsets[idxNeig].getX());
                                                        positionsY[idxPart] += boxWidth * FReal(offsets[idxNeig].getY());
                                                        positionsZ[idxPart] += boxWidth * FReal(offsets[idxNeig].getZ());
                                                    }
                                                }
                                            }

                                            myThreadkernels->P2PRemote(currentIter.coord,currentIter.targets,
                                                                       currentIter.sources, periodicNeighbors, periodicNeighborsCounter);

                                            for(int idxNeig = 0 ; idxNeig < 27 ; ++idxNeig){
                                                if( periodicNeighbors[idxNeig] ){
                                                    FReal*const positionsX = periodicNeighbors[idxNeig]->getWPositions()[0];
                                                    FReal*const positionsY = periodicNeighbors[idxNeig]->getWPositions()[1];
                                                    FReal*const positionsZ = periodicNeighbors[idxNeig]->getWPositions()[2];

                                                    for(FSize idxPart = 0; idxPart < periodicNeighbors[idxNeig]->getNbParticles() ; ++idxPart){
                                                        positionsX[idxPart] -= boxWidth * FReal(offsets[idxNeig].getX());
                                                        positionsY[idxPart] -= boxWidth * FReal(offsets[idxNeig].getY());
                                                        positionsZ[idxPart] -= boxWidth * FReal(offsets[idxNeig].getZ());
                                                    }
                                                }
                                            }
                                        }

                                        myThreadkernels->P2P( currentIter.coord, currentIter.targets,
                                                          currentIter.sources, neighbors, counter - periodicNeighborsCounter);
                                    }
                                }

                            }
                        }
                        previous = endAtThisShape;
                    }

#pragma omp taskwait
                }
                FLOG(computationCounter.tac());
            }
        }

        // Wait for the communication to finish (and the previous computation as well)
#pragma omp barrier


        //////////////////////////////////////////////////////////
        // P2P computation that needs other processes' data
        //////////////////////////////////////////////////////////
#pragma omp master
        { FLOG( computation2Counter.tic() ); }

        if(p2pEnabled){
            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
            // There is a maximum of 26 neighbors (27 slots, the center one unused)
            ContainerClass* neighbors[27];
            MortonIndex indexesNeighbors[27];
            int indexArray[26];
            // Box limit
            const int limite = 1 << (this->OctreeHeight - 1);
            FAssertLF(leafsNeedOtherData.getSize() < std::numeric_limits<int>::max());
            const int nbLeafToProceed = int(leafsNeedOtherData.getSize());

#pragma omp for schedule(static)
            for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
                LeafData currentIter = leafsNeedOtherData[idxLeafs];

                // need the current particles and the neighbors' particles
                int counter = 0;
                memset( neighbors, 0, sizeof(ContainerClass*) * 27);

                // Get the indexes of the possible neighbors
                const int nbNeigh = getNeighborsIndexesPeriodic(currentIter.coord,limite,
                                                                indexesNeighbors,indexArray,AllDirs);

                for(int idxNeigh = 0 ; idxNeigh < nbNeigh ; ++idxNeigh){
                    if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
                        ContainerClass*const hypotheticNeighbor = otherP2Ptree.getLeafSrc(indexesNeighbors[idxNeigh]);
                        if(hypotheticNeighbor){
                            neighbors[ indexArray[idxNeigh] ] = hypotheticNeighbor;
                            ++counter;
                        }
                    }
                }
                myThreadkernels.P2PRemote( currentIter.coord, currentIter.targets,
                                           currentIter.sources, neighbors, counter);

            }

        }


        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
            delete sendBuffer[idxProc];
            delete recvBuffer[idxProc];
        }
        delete[] globalReceiveMap;
        delete[] leafsDataArray;

        FLOG(computation2Counter.tac());


        FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Computation P2P 2 : " << computation2Counter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" );

    }


    int getNeighborsIndexesPeriodic(const FTreeCoordinate& center, const int boxLimite, MortonIndex indexes[26], int indexInArray[26], const int inDirection) const{
        if(center.getX() != 0 && center.getY() != 0 && center.getZ() != 0 &&
                center.getX() != boxLimite -1 && center.getY() != boxLimite -1  && center.getZ() != boxLimite -1 ){
            return getNeighborsIndexes(center, indexes, indexInArray);
        }

        int idxNeig = 0;

        const int startX = (TestPeriodicCondition(inDirection, DirMinusX) || center.getX() != 0 ?-1:0);
        const int endX = (TestPeriodicCondition(inDirection, DirPlusX) || center.getX() != boxLimite - 1 ?1:0);
        const int startY = (TestPeriodicCondition(inDirection, DirMinusY) || center.getY() != 0 ?-1:0);
        const int endY = (TestPeriodicCondition(inDirection, DirPlusY) || center.getY() != boxLimite - 1 ?1:0);
        const int startZ = (TestPeriodicCondition(inDirection, DirMinusZ) || center.getZ() != 0 ?-1:0);
        const int endZ = (TestPeriodicCondition(inDirection, DirPlusZ) || center.getZ() != boxLimite - 1 ?1:0);

        // We test all cells around
        for(int idxX = startX ; idxX <= endX ; ++idxX){
            for(int idxY = startY ; idxY <= endY ; ++idxY){
                for(int idxZ = startZ ; idxZ <= endZ ; ++idxZ){
                    // if we are not on the current cell
                    if( idxX || idxY || idxZ ){
                        FTreeCoordinate other(center.getX() + idxX,center.getY() + idxY,center.getZ() + idxZ);

                        if( other.getX() < 0 ) other.setX( other.getX() + boxLimite );
                        else if( boxLimite <= other.getX() ) other.setX( other.getX() - boxLimite );
                        if( other.getY() < 0 ) other.setY( other.getY() + boxLimite );
                        else if( boxLimite <= other.getY() ) other.setY( other.getY() - boxLimite );
                        if( other.getZ() < 0 ) other.setZ( other.getZ() + boxLimite );
                        else if( boxLimite <= other.getZ() ) other.setZ( other.getZ() - boxLimite );

                        indexes[idxNeig] = other.getMortonIndex(this->OctreeHeight - 1);
                        indexInArray[idxNeig] = ((idxX+1)*3  + (idxY+1))*3 + (idxZ+1);
                        ++idxNeig;
                    }
                }
            }
        }
        return idxNeig;
    }
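    // Note on the encoding used above: indexInArray[i] = ((idxX+1)*3 + (idxY+1))*3 + (idxZ+1)
    // maps the 3x3x3 relative offsets to [0;26], e.g. (-1,-1,-1) -> 0 and (1,1,1) -> 26;
    // the center (0,0,0) would map to 13 but is never produced since it is skipped.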

    int getNeighborsIndexes(const FTreeCoordinate& center, MortonIndex indexes[26], int indexInArray[26]) const{
        int idxNeig = 0;

        // We test all cells around
        for(int idxX = -1 ; idxX <= 1 ; ++idxX){
            for(int idxY = -1 ; idxY <= 1 ; ++idxY){
                for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){
                    // if we are not on the current cell
                    if( idxX || idxY || idxZ ){
                        const FTreeCoordinate other(center.getX() + idxX,center.getY() + idxY,center.getZ() + idxZ);
                        indexes[idxNeig] = other.getMortonIndex(this->OctreeHeight - 1);
                        indexInArray[idxNeig] = ((idxX+1)*3  + (idxY+1))*3 + (idxZ+1);
                        ++idxNeig;
                    }
                }
            }
        }
        return idxNeig;
    }


    int getPeriodicInteractionNeighbors(const FTreeCoordinate& workingCell,const int inLevel, MortonIndex inNeighbors[/*189+26+1*/216], int inNeighborsPosition[/*189+26+1*/216],
                const int inDirection, const int neighSeparation) const{

        // Take each child of the parent's neighbors if it is not a direct neighbor
        // Parent coordinate
        const FTreeCoordinate parentCell(workingCell.getX()>>1,workingCell.getY()>>1,workingCell.getZ()>>1);

        // Limit at the parent level: the number of boxes per side (halved at each level up)
        const int boxLimite = FMath::pow2(inLevel-1);

        // If the parent is not on a border we can use the normal interaction-list method
        if( !(parentCell.getX() == 0 || parentCell.getY() == 0 || parentCell.getZ() == 0 ||
              parentCell.getX() == boxLimite - 1 || parentCell.getY() == boxLimite - 1 || parentCell.getZ() == boxLimite - 1 ) ) {
            return getInteractionNeighbors( workingCell, inLevel, inNeighbors, inNeighborsPosition, neighSeparation);
        }

        const int startX =  (TestPeriodicCondition(inDirection, DirMinusX) || parentCell.getX() != 0 ?-1:0);
        const int endX =    (TestPeriodicCondition(inDirection, DirPlusX)  || parentCell.getX() != boxLimite - 1 ?1:0);
        const int startY =  (TestPeriodicCondition(inDirection, DirMinusY) || parentCell.getY() != 0 ?-1:0);
        const int endY =    (TestPeriodicCondition(inDirection, DirPlusY)  || parentCell.getY() != boxLimite - 1 ?1:0);
        const int startZ =  (TestPeriodicCondition(inDirection, DirMinusZ) || parentCell.getZ() != 0 ?-1:0);
        const int endZ =    (TestPeriodicCondition(inDirection, DirPlusZ)  || parentCell.getZ() != boxLimite - 1 ?1:0);

        int idxNeighbors = 0;
        // We test all cells around
        for(int idxX = startX ; idxX <= endX ; ++idxX){
            for(int idxY = startY ; idxY <= endY ; ++idxY){
                for(int idxZ = startZ ; idxZ <= endZ ; ++idxZ){
                    // if we are not on the current cell
                    if( idxX || idxY || idxZ ){

                        const FTreeCoordinate otherParent(parentCell.getX() + idxX,parentCell.getY() + idxY,parentCell.getZ() + idxZ);
                        FTreeCoordinate otherParentInBox(otherParent);
                        // periodic
                        if( otherParentInBox.getX() < 0 ){
                            otherParentInBox.setX( otherParentInBox.getX() + boxLimite );
                        }
                        else if( boxLimite <= otherParentInBox.getX() ){
                            otherParentInBox.setX( otherParentInBox.getX() - boxLimite );
                        }

                        if( otherParentInBox.getY() < 0 ){
                            otherParentInBox.setY( otherParentInBox.getY() + boxLimite );
                        }
                        else if( boxLimite <= otherParentInBox.getY() ){
                            otherParentInBox.setY( otherParentInBox.getY() - boxLimite );
                        }

                        if( otherParentInBox.getZ() < 0 ){
                            otherParentInBox.setZ( otherParentInBox.getZ() + boxLimite );
                        }
                        else if( boxLimite <= otherParentInBox.getZ() ){
                            otherParentInBox.setZ( otherParentInBox.getZ() - boxLimite );
                        }

                        const MortonIndex mortonOtherParent = otherParentInBox.getMortonIndex(inLevel-1);

                        // For each child
                        for(int idxCousin = 0 ; idxCousin < 8 ; ++idxCousin){
                            const int xdiff  = ((otherParent.getX()<<1) | ( (idxCousin>>2) & 1)) - workingCell.getX();
                            const int ydiff  = ((otherParent.getY()<<1) | ( (idxCousin>>1) & 1)) - workingCell.getY();
                            const int zdiff  = ((otherParent.getZ()<<1) | (idxCousin&1))         - workingCell.getZ();

                            // Keep it only if it is not a direct neighbor
                            if(FMath::Abs(xdiff) > neighSeparation || FMath::Abs(ydiff) > neighSeparation || FMath::Abs(zdiff) > neighSeparation){
                                // add to neighbors
                                inNeighborsPosition[idxNeighbors] = (((xdiff+3) * 7) + (ydiff+3)) * 7 + zdiff + 3;
                                inNeighbors[idxNeighbors++] = (mortonOtherParent << 3) | idxCousin;
                            }
                        }
                    }
                }
            }
        }

        return idxNeighbors;
    }

    int getInteractionNeighbors(const FTreeCoordinate& workingCell,const int inLevel, MortonIndex inNeighbors[/*189+26+1*/216], int inNeighborsPosition[/*189+26+1*/216],
                const int neighSeparation) const{

        // Take each child of the parent's neighbors if it is not a direct neighbor
        // Parent coordinate
        const FTreeCoordinate parentCell(workingCell.getX()>>1,workingCell.getY()>>1,workingCell.getZ()>>1);

        int idxNeighbors = 0;
        // We test all cells around
        for(int idxX = -1 ; idxX <= 1 ; ++idxX){
            for(int idxY = -1 ; idxY <= 1 ; ++idxY){
                for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){
                    // if we are not on the current cell
                    if( idxX || idxY || idxZ ){

                        const FTreeCoordinate otherParent(parentCell.getX() + idxX,parentCell.getY() + idxY,parentCell.getZ() + idxZ);

                        const MortonIndex mortonOtherParent = otherParent.getMortonIndex(inLevel-1);

                        // For each child
                        for(int idxCousin = 0 ; idxCousin < 8 ; ++idxCousin){
                            const int xdiff  = ((otherParent.getX()<<1) | ( (idxCousin>>2) & 1)) - workingCell.getX();
                            const int ydiff  = ((otherParent.getY()<<1) | ( (idxCousin>>1) & 1)) - workingCell.getY();
                            const int zdiff  = ((otherParent.getZ()<<1) | (idxCousin&1)) - workingCell.getZ();

                            // Keep it only if it is not a direct neighbor
                            if(FMath::Abs(xdiff) > neighSeparation || FMath::Abs(ydiff) > neighSeparation || FMath::Abs(zdiff) > neighSeparation){
                                // add to neighbors
                                inNeighborsPosition[idxNeighbors] = ((( (xdiff+3) * 7) + (ydiff+3))) * 7 + zdiff + 3;
                                inNeighbors[idxNeighbors++] = (mortonOtherParent << 3) | idxCousin;
                            }
                        }
                    }
                }
            }
        }

        return idxNeighbors;
    }

    /////////////////////////////////////////////////////////////////////////////
    // Periodic levels = levels <= 0
    /////////////////////////////////////////////////////////////////////////////


    /** Get the index of an interaction neighbor (for M2L)
     * @param x the x position in the interactions (from -3 to +3)
     * @param y the y position in the interactions (from -3 to +3)
     * @param z the z position in the interactions (from -3 to +3)
     * @return the index (from 0 to 342)
     */
    int neighIndex(const int x, const int y, const int z) const {
        return (((x+3)*7) + (y+3))*7 + (z + 3);
    }
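    // For example: neighIndex(-3,-3,-3) == 0, neighIndex(0,0,0) == 171 (the center,
    // never used as an interaction) and neighIndex(3,3,3) == 342, which matches the
    // 7x7x7 = 343 slots of the M2L neighbors arrays.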

    /** Periodicity Core
     * This function is split into several parts:
     * 1 - special case management:
     * there is nothing to do if nbLevelsAboveRoot == -1, and only
     * an M2L if nbLevelsAboveRoot == 0
     * 2 - if nbLevelsAboveRoot > 0:
     * first we compute the M2M (and the special border M2M if needed),
     * then the M2L, taking the periodicity directions into account,
     * then the border, using the precomputed M2M,
     * and finally the L2L
     */
    void processPeriodicLevels(){
        FLOG( FLog::Controller.write("\tStart Periodic Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);

        if( nbLevelsAboveRoot != -1 ){
            // we will use offsetRealTree-1 cells, but for simplicity allocate offsetRealTree;
            // upperCells[offsetRealTree-1] is the root cell
            CellClass*const upperCells = new CellClass[offsetRealTree];
//            {
//                typename OctreeClass::Iterator octreeIterator(tree);
//                octreeIterator.gotoLeft();
//                kernels[0]->M2M( &upperCells[offsetRealTree-1], octreeIterator.getCurrentBox(), offsetRealTree);
//            }
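            // Above the root, each level represents a periodic tiling twice as wide as
            // the one below. Pointing all 8 virtual children to the same cell makes the
            // M2M produce the multipole of 8 periodic copies, which is exactly the
            // content of the corresponding upper-level cell.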
            {
                CellClass* virtualChild[8];
                const int idxLevel = offsetRealTree-1;
                FMemUtils::setall(virtualChild,&rootCellFromProc,8);
                kernels[0]->M2M( &upperCells[idxLevel-1], virtualChild, idxLevel);
            }
            {
                CellClass* virtualChild[8];
                for(int idxLevel = offsetRealTree-1-1 ; idxLevel > 1  ; --idxLevel){
                    FMemUtils::setall(virtualChild,&upperCells[idxLevel],8);
                    kernels[0]->M2M( &upperCells[idxLevel-1], virtualChild, idxLevel);
                }
            }
            CellClass*const downerCells = new CellClass[offsetRealTree];
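            // The M2L ranges below assume the current cell sits at the lower corner of
            // its (virtual) parent: the children of the parent's neighbors then span
            // offsets -2..3, i.e. 6^3 - 3^3 = 189 far interactions per level. At the
            // first upper level the repetition only covers the -2..1 cube
            // (4^3 - 3^3 = 37 interactions).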

            {
                const int idxUpperLevel = 2;

                const CellClass* neighbors[343];
                memset(neighbors, 0, sizeof(CellClass*) * 343);
                int counter = 0;
                for(int idxX = -2 ; idxX <= 1 ; ++idxX){
                    for(int idxY = -2 ; idxY <= 1 ; ++idxY){
                        for(int idxZ = -2 ; idxZ <= 1 ; ++idxZ){
                            if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1){
                                neighbors[neighIndex(idxX,idxY,idxZ)] = &upperCells[idxUpperLevel-1];
                                ++counter;
                            }
                        }
                    }
                }
                // compute M2L
                kernels[0]->M2L( &downerCells[idxUpperLevel-1] , neighbors, counter, idxUpperLevel);
            }

            for(int idxUpperLevel = 3 ; idxUpperLevel <= offsetRealTree-1 ; ++idxUpperLevel){
                const CellClass* neighbors[343];
                memset(neighbors, 0, sizeof(CellClass*) * 343);
                int counter = 0;
                for(int idxX = -2 ; idxX <= 3 ; ++idxX){
                    for(int idxY = -2 ; idxY <= 3 ; ++idxY){
                        for(int idxZ = -2 ; idxZ <= 3 ; ++idxZ){
                            if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1){
                                neighbors[neighIndex(idxX,idxY,idxZ)] = &upperCells[idxUpperLevel-1];
                                ++counter;
                            }
                        }
                    }
                }

                // compute M2L
                kernels[0]->M2L( &downerCells[idxUpperLevel-1] , neighbors, counter, idxUpperLevel);
            }
            {
                const int idxUpperLevel = offsetRealTree;
                const CellClass* neighbors[343];
                memset(neighbors, 0, sizeof(CellClass*) * 343);
                int counter = 0;
                for(int idxX = -2 ; idxX <= 3 ; ++idxX){
                    for(int idxY = -2 ; idxY <= 3 ; ++idxY){
                        for(int idxZ = -2 ; idxZ <= 3 ; ++idxZ){
                            if( FMath::Abs(idxX) > 1 || FMath::Abs(idxY) > 1 || FMath::Abs(idxZ) > 1){
                                neighbors[neighIndex(idxX,idxY,idxZ)] = &rootCellFromProc;
                                ++counter;
                            }
                        }
                    }
                }

                // compute M2L
                kernels[0]->M2L( &rootCellFromProc , neighbors, counter, idxUpperLevel);
            }

            {
                CellClass* virtualChild[8];
                memset(virtualChild, 0, sizeof(CellClass*) * 8);
                for(int idxLevel = 2 ; idxLevel <= offsetRealTree-1-1  ; ++idxLevel){
                    virtualChild[0] = &downerCells[idxLevel];
                    kernels[0]->L2L( &downerCells[idxLevel-1], virtualChild, idxLevel);
                }
            }
            {
                CellClass* virtualChild[8];
                memset(virtualChild, 0, sizeof(CellClass*) * 8);
                const int idxLevel = offsetRealTree-1;
                virtualChild[0] = &rootCellFromProc;
                kernels[0]->L2L( &downerCells[idxLevel-1], virtualChild, idxLevel);
            }

            // L2L from 0 to level 1
//            {
//                typename OctreeClass::Iterator octreeIterator(tree);
//                octreeIterator.gotoLeft();
//                kernels[0]->L2L( &downerCells[offsetRealTree-1], octreeIterator.getCurrentBox(), offsetRealTree);
//            }

            delete[] upperCells;
            delete[] downerCells;
        }

        FLOG( FLog::Controller << "\tFinished (@Periodic = "  << counterTime.tacAndElapsed() << " s)\n" );
    }

};






#endif // FFMMALGORITHMTHREADPROCPPERIODIC_HPP