// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
// This software is governed by the CeCILL-C and LGPL licenses and
// abiding by the rules of distribution of free software.
//
// This program is distributed in the hope that it will be useful,
// but WITHOUT ANY WARRANTY; without even the implied warranty of
// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
// GNU General Public and CeCILL-C Licenses for more details.
// "http://www.cecill.info".
// "http://www.gnu.org/licenses".
// ===================================================================================
#ifndef FFMMALGORITHMTHREADPROCPPERIODIC_HPP
#define FFMMALGORITHMTHREADPROCPPERIODIC_HPP


#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"
#include "../Utils/FMemUtils.hpp"

#include "../Containers/FBoolArray.hpp"
#include "../Containers/FOctree.hpp"
#include "../Containers/FLightOctree.hpp"

#include "../Containers/FBufferWriter.hpp"
#include "../Containers/FBufferReader.hpp"
#include "../Containers/FMpiBufferWriter.hpp"
#include "../Containers/FMpiBufferReader.hpp"
#include "../Utils/FMpi.hpp"

#include <omp.h>

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"
#include <memory>

/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 * @class FFmmAlgorithmThreadProcPeriodic
 * @brief
 * Please read the license
 *
 * This class is a threaded FMM algorithm with MPI.
 * It iterates on a tree and calls the kernels with the proper arguments.
 * It uses the inspector-executor model:
 * it iterates on the tree and builds an array, then works in parallel on this array.
 *
 * Of course this class does not deallocate the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model
 * schedule(runtime) export OMP_NUM_THREADS=2
 * export OMPI_CXX=`which g++-4.4`
 * mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 * --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
 * ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
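//
// A minimal usage sketch (illustration only, not part of the original sources). It assumes the
// usual ScalFmm setup: OctreeClass, CellClass, ContainerClass, KernelClass and LeafClass have
// already been chosen, the octree has been filled with the particles owned by this process,
// and the kernel constructor takes (treeHeight, boxWidth, boxCenter) as most ScalFmm kernels do.
// Names such as `app`, `loader`, `tree`, `aboveLevels` and `kernels` are placeholders:
//
//     FMpi app(argc, argv);
//     OctreeClass tree(NbLevels, SubHeight, loader.getBoxWidth(), loader.getCenterOfBox());
//     // ... insert the particles owned by this process into `tree` ...
//
//     FFmmAlgorithmThreadProcPeriodic<FReal, OctreeClass, CellClass, ContainerClass,
//                                     KernelClass, LeafClass> algo(app.global(), &tree, aboveLevels);
//     // The kernel must be built with the *extended* (periodic) geometry, then attached:
//     KernelClass kernels(algo.extendedTreeHeight(), algo.extendedBoxWidth(), algo.extendedBoxCenter());
//     algo.setKernel(&kernels);
//     algo.execute();
//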
template<class FReal, class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProcPeriodic : public FAbstractAlgorithm {
    OctreeClass* const tree;                 //< The octree to work on
    KernelClass** kernels;                   //< The kernels

    const FMpi::FComm& comm;                 //< MPI comm
    CellClass rootCellFromProc;     //< Root of the tree, needed for the periodicity
    const int nbLevelsAboveRoot;    //< The number of levels the user asks to go above the tree (>= -1)
    const int offsetRealTree;       //< Level offset used to compute the "fake" level (nbLevelsAboveRoot + 3)

    typename OctreeClass::Iterator* iterArray;
    typename OctreeClass::Iterator* iterArrayComm;  //< Used to store pointers to the cells/leaves to send/receive
    int numberOfLeafs;                          //< The number of leaves

    const int MaxThreads;               //< The maximum number of threads allowed by OpenMP

    const int nbProcess;                //< Number of processes
    const int idProcess;                //< Id of the current process

    const int OctreeHeight;

    const int leafLevelSeparationCriteria;
public:
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };

    Interval& getWorkingInterval(const int level, const int proc)const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }
private:
    Interval*const intervals;
    Interval*const workingIntervalsPerLevel;
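    // workingIntervalsPerLevel is stored flat: the interval of process p at level l
    // is at index [OctreeHeight * p + l], as read by getWorkingInterval().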

    /** To know if a proc has work at a given level (if it holds cells and is responsible for them) */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** Return true if idxProc's left cell at idxLevel+1 has the same parent as our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** Return true if idxProc's right cell at idxLevel+1 has the same parent as our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }

public:

    void setKernel(KernelClass*const inKernels){
        this->kernels = new KernelClass*[MaxThreads];
        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp critical (InitFFmmAlgorithmThreadProcPeriodic)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }
    }

    Interval& getWorkingInterval(const int level){
        return getWorkingInterval(level, idProcess);
    }

    bool hasWorkAtLevel(const int level){
        return idProcess == 0 || getWorkingInterval(level, idProcess - 1).rightIndex < getWorkingInterval(level, idProcess).rightIndex;
    }

    /** The constructor needs the octree and the communicator used for the computation
     * @param inComm the MPI communicator
     * @param inTree the octree to work on
     * @param inUpperLevel the number of levels to repeat above the root (>= -1)
     * @param inLeafLevelSeperationCriteria the separation criteria used at the leaf level
     * An assert is raised if one of the arguments is invalid
     */
    FFmmAlgorithmThreadProcPeriodic(const FMpi::FComm& inComm, OctreeClass* const inTree,
                                    const int inUpperLevel = 2, const int inLeafLevelSeperationCriteria = 1)
        : tree(inTree) , kernels(nullptr), comm(inComm), nbLevelsAboveRoot(inUpperLevel), offsetRealTree(inUpperLevel + 3),
          numberOfLeafs(0),
          MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
          OctreeHeight(tree->getHeight()),
          leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
          intervals(new Interval[inComm.processCount()]),
          workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {

        FAssertLF(tree, "tree cannot be null");
        FAssertLF(-1 <= inUpperLevel, "inUpperLevel cannot be < -1");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");
        FAbstractAlgorithm::setNbLevelsInTree(extendedTreeHeight());

        FLOG(FLog::Controller << "FFmmAlgorithmThreadProcPeriodic\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
    }

    /** Default destructor */
    virtual ~FFmmAlgorithmThreadProcPeriodic(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;
        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }




    long long int theoricalRepetition() const {
        if( nbLevelsAboveRoot == -1 ){
            // we know it is 3 (-1;+1)
            return 3;
        }
        // Otherwise we find the repetition in one direction and double it
        const long long int oneDirectionRepetition = (1<<(nbLevelsAboveRoot+2)); // 2^(nbLevelsAboveRoot+2) in each dim
        const long long int fullRepetition = 2 * oneDirectionRepetition;
        return fullRepetition;
    }


    void repetitionsIntervals(FTreeCoordinate*const min, FTreeCoordinate*const max) const {
        if( nbLevelsAboveRoot == -1 ){
            // We know it is (-1;1)
            min->setPosition(-1,-1,-1);
            max->setPosition(1,1,1);
        }
        else{
            const int halfRepeated = int(theoricalRepetition()/2);
            min->setPosition(-halfRepeated,-halfRepeated,-halfRepeated);
            // if we repeat the box 8 times, we go from [-4 to 3]
            max->setPosition(halfRepeated-1,halfRepeated-1,halfRepeated-1);
        }
    }
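
    // Worked example (illustration only): with nbLevelsAboveRoot = 2, theoricalRepetition()
    // returns 2 * 2^(2+2) = 32 repetitions per dimension, and repetitionsIntervals() gives
    // min = (-16,-16,-16) and max = (15,15,15).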


    FReal extendedBoxWidth() const {
        // The simulation box is repeated 4 times in each direction when nbLevelsAboveRoot == -1,
        // and the number of repetitions doubles for each additional level above the root
        return tree->getBoxWidth() * FReal(1<<(nbLevelsAboveRoot+3));
    }

    /** This function has to be used to init the kernel with the correct arguments.
     * It returns the box center seen from the kernel point of view, for the periodicity the user asked for.
     * It is computed from the original box width and box center stored in the tree.
     * @return the center the kernel should use
     */
    FPoint<FReal> extendedBoxCenter() const {
        const FReal originalBoxWidth     = tree->getBoxWidth();
        const FReal originalBoxWidthDiv2 = originalBoxWidth/2.0;
        const FPoint<FReal> originalBoxCenter   = tree->getBoxCenter();

        const FReal offset = extendedBoxWidth()/FReal(2.0);
        return FPoint<FReal>( originalBoxCenter.getX() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getY() - originalBoxWidthDiv2 + offset,
                       originalBoxCenter.getZ() - originalBoxWidthDiv2 + offset);
    }

    /** This function has to be used to init the kernel with the correct arguments.
     * It returns the tree height seen from the kernel point of view, for the periodicity the user asked for.
     * It is computed from the real tree height.
     * @return the height the kernel should use
     */
    int extendedTreeHeight() const {
        // The real height plus the levels above the root
        return OctreeHeight + offsetRealTree;
    }
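
    // Illustration (assumed values, not from the original file): with a real tree of height 5,
    // a box width of 1.0 and nbLevelsAboveRoot = 2 (so offsetRealTree = 5), a kernel for the
    // periodic run would be given extendedTreeHeight() = 5 + 5 = 10 and
    // extendedBoxWidth() = 1.0 * 2^(2+3) = 32.0, while extendedBoxCenter() shifts the center so
    // that the extended box and the real box share the same lower corner.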


protected:
    /**
     * To execute the fmm algorithm
     * Call this function to run the complete algorithm
     */
    void executeCore(const unsigned operationsToProceed) override {
        // Count the leaves
        this->numberOfLeafs = 0;
        {
            Interval myFullInterval;
            {//Building the interval with the first and last leaves (and count the number of leaves)
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                do{
                    ++this->numberOfLeafs;
                } while(octreeIterator.moveRight());
                myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
            }
            // Allocate arrays to store pointers to the cells of a level
            iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
            iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
            FAssertLF(iterArray,     "iterArray     bad alloc");
            FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

            // We get the leftIndex/rightIndex indexes from each proc
            FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()),  __LINE__ );

            // Build my intervals for all levels
            std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
            // At leaf level we know it is the full interval
            myIntervals[OctreeHeight - 1] = myFullInterval;
            // We can estimate the interval for each level by using the parent/child relation
            for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
            }

            // Process 0 uses the estimates as real intervals, but other processes
            // should remove cells that belong to others
            if(idProcess != 0){
                // For each level, test whether the process on the left (idProcess-1) owns cells that I thought I owned
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.gotoBottomLeft();
                octreeIterator.moveUp();

                // At h-1 the working limit is the parent of the right cell of the proc on the left
                MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                // We check if we have no more work to do
                int nullIntervalFromLevel = 0;
                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                    while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                        if( !octreeIterator.moveRight() ){
                            // We cannot move right, we do not own any more cells
                            nullIntervalFromLevel = idxLevel;
                            break;
                        }
                    }
                    // If we are responsible for some cells at this level keep the first index
                    if(nullIntervalFromLevel == 0){
                        myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                        octreeIterator.moveUp();
                        workingLimitAtLevel >>= 3;
                    }
                }
                // In case we are not responsible for any cells we put the leftIndex = rightIndex+1
                for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                }
            }

            // We get the leftIndex/rightIndex indexes from each proc
            FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                            workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()),  __LINE__ );
        }

        // run;
        if(operationsToProceed & FFmmP2M) bottomPass();

        if(operationsToProceed & FFmmM2M) upwardPass();

        if(operationsToProceed & FFmmM2L) transferPass();

        if(operationsToProceed & FFmmL2L) downardPass();

        if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
        // delete array
        delete []     iterArray;
        delete [] iterArrayComm;
        iterArray     = nullptr;
        iterArrayComm = nullptr;
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M.
     * It is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the pointers to the leaves into the array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());
        FLOG(computationCounter.tic());
#pragma omp parallel
        {
            // Each thread gets its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////
    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from the leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();
        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to
        // Starting from the proc on the left, this variable will go down to 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 8-1 receptions
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        FSize bufferSize;
        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
        FSize recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that sends a cell to me
        // This variable will go to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // We work from height-1 to 1
        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 0 ; --idxLevel ){
            const int fackLevel = idxLevel + offsetRealTree;
            // Are my cells covered by my neighbors' working interval, meaning I have no more work?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for the size send/recv requests

            int nbCellsToSkip     = 0; // The number of cells to skip (they are out of my working interval)
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }

            FLOG(parallelCounter.tic());
            #pragma omp parallel
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                // This single section posts and receives the comms, and then does the M2M associated with them.
                #pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // Master proc never sends
                    if(idProcess != 0){
                        // Skip processes that have no work at that level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Does the next proc that has work share the parent of my left cell?
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);

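                            // packageFlags is a bitmask: bit idxChild is set when the corresponding
                            // child cell has been serialized into sendBuffer (read back on the receiver side).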
                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the rightmost of the cells outside my working interval should be taken into
                            // consideration (at pos nbCellsToSkip-1); the others (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if the child exists and was part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    // Post the receives; data needed in several parts of the section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Test: whether the father of firstProcThatSend's minimal interval value is lower than mine
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]),
                                        idProcSource, FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting for anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Test: whether the father of firstProcThatSend's minimal interval value is lower than mine
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED,
                                        idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting for anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We have received something, so we need to perform the last M2M
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve the data and merge my children with the children from the others
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());

                            int position = 0;
                            int positionToInsert = 0;
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(positionToInsert < 7);
                                FAssertLF(position < 8);
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];

                                packageFlags >>= 1;
                                position += 1;
                                positionToInsert += 1;
                            }

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute
                        myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, fackLevel);
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section

                // All threads perform the M2M
                #pragma omp for nowait
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
                    myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), fackLevel);
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }
        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );


        //////////////////////////////////////////////////////////////////
        //Periodicity
        //////////////////////////////////////////////////////////////////
        octreeIterator = typename OctreeClass::Iterator(tree);
        if( idProcess == 0){
            int iterRequestsSize = 0;

            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    MPI_Irecv(&recvBufferSize[iterRequestsSize], 1, FMpi::GetType(recvBufferSize[iterRequestsSize]), idxProc,
                            FMpi::TagFmmM2MSize, comm.getComm(), &requests[iterRequestsSize]);
                    iterRequestsSize += 1;
                    FAssertLF(iterRequestsSize <= 7);
                }
            }
            MPI_Waitall( iterRequestsSize, requests, MPI_STATUSES_IGNORE);

            int iterRequests = 0;
            CellClass* currentChild[8];
            memcpy(currentChild, octreeIterator.getCurrentBox(), 8 * sizeof(CellClass*));
            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc ){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[iterRequests].cleanAndResize(recvBufferSize[iterRequests]);
                    FAssertLF(recvBufferSize[iterRequests] < std::numeric_limits<int>::max());
                    MPI_Irecv(recvBuffer[iterRequests].data(), int(recvBufferSize[iterRequests]), MPI_BYTE, idxProc,
                            FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests]);
                    iterRequests += 1;
                    FAssertLF(iterRequests <= 7);
                }
            }
            MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE);

            // Retrieve the data and merge my children with the children from the others
            int counterProc = 0;
            for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
                if( procHasWorkAtLevel(1,idxProc) ){
                    recvBuffer[counterProc].seek(0);
                    int state = int(recvBuffer[counterProc].getValue<char>());
                    int position = 0;
                    while( state && position < 8){
                        while(!(state & 0x1)){
                            state >>= 1;
                            ++position;
                        }
                        FAssertLF(!currentChild[position], "Already has a cell here");
                        recvBufferCells[position].deserializeUp(recvBuffer[counterProc]);
                        currentChild[position] = (CellClass*) &recvBufferCells[position];
                        state >>= 1;
                        ++position;
                    }

                    FAssertLF(recvBuffer[counterProc].tell() == recvBufferSize[counterProc]);
                    counterProc += 1;
                }
            }
            (*kernels[0]).M2M( &rootCellFromProc , currentChild, offsetRealTree);
            processPeriodicLevels();
        }
        else {
            if( hasWorkAtLevel(1) ){
                const int firstChild = getWorkingInterval(1, idProcess).leftIndex & 7;
                const int lastChild  = getWorkingInterval(1, idProcess).rightIndex & 7;
                CellClass** child = octreeIterator.getCurrentBox();
                char state = 0;
                sendBuffer.write(state);
                for(int idxChild = firstChild ; idxChild <= lastChild ; ++idxChild){
                    if( child[idxChild] ){
                        child[idxChild]->serializeUp(sendBuffer);
                        state = char( state | (0x1 << idxChild));
                    }
                }
                sendBuffer.writeAt(0,state);
                FSize sizeToSend = sendBuffer.getSize();
                MPI_Send(&sizeToSend, 1, FMpi::GetType(sizeToSend), 0, FMpi::TagFmmM2MSize, comm.getComm());
                MPI_Send(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, 0, FMpi::TagFmmM2M, comm.getComm());
            }
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First know what to send to who
        //////////////////////////////////////////////////////////////////

        // pointers to send
        std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]);
        // index
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from others
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // All processes tell each other
        // what they will send to whom
        long long int*const globalReceiveMap = new long long  int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long  int) * nbProcess * nbProcess * OctreeHeight);
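        // globalReceiveMap[(sender * nbProcess * OctreeHeight) + idxLevel * nbProcess + receiver] will
        // hold the number of bytes that `sender` plans to send to `receiver` at level idxLevel.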

        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);

        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);
        #pragma omp parallel
        {
            #pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has already been sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    ////octreeIterator.moveDown();
                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each level
                    for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cell
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Which cells potentially need data from others and, at the same time,
                        // are potentially needed by others
                        int neighborsPosition[/*189+26+1*/216];
                        MortonIndex neighborsIndexes[/*189+26+1*/216];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = getPeriodicInteractionNeighbors(iterArrayLocal[idxCell].getCurrentGlobalCoordinate(),
                                                                               idxLevel,
                                                                               neighborsIndexes, neighborsPosition, AllDirs, leafLevelSeparationCriteria);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, comm.getComm()),  __LINE__ );
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // To send in an asynchronous way
                MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight];
                MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight];
                int iterRequest = 0;

                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));
                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));
                            FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                             int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc,
                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait to receive everything (and send everything)
                FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__);

                delete[] requests;
                delete[] status;

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////
            #pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                ////octreeIterator.moveDown();
                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each levels
                for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    const int fackLevel = idxLevel + offsetRealTree;
                    const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);
                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
                        const int chunckSize = FMath::Max(1, numberOfCells/(omp_get_num_threads()*omp_get_num_threads()));
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
                            #pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[343];

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
                                    const int counter = tree->getPeriodicInteractionNeighbors(neighbors,
                                                                            iterArray[idxCellToCompute].getCurrentGlobalCoordinate(),
                                                                            idxLevel, AllDirs, separationCriteria);

                                    if(counter)
                                        myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, counter, fackLevel);
                                }
                            }
                        }
                    }//End of task spawning
                    #pragma omp taskwait
                    for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
                        #pragma omp task  default(none) firstprivate(idxThread,idxLevel)
                        {
                            kernels[idxThread]->finishedLevelM2L(fackLevel);
                        }
                    }
                    #pragma omp taskwait

                    FLOG(computationCounter.tac());
                }
            }
        }


        {
            FLOG(receiveCounter.tic());
            typename OctreeClass::Iterator octreeIterator(tree);
            ////octreeIterator.moveDown();
            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
            // Compute a second time, on the cells that needed data from other procs
            // for each level
            for(int idxLevel = 1 ; idxLevel < OctreeHeight ; ++idxLevel ){
                const int fackLevel = idxLevel + offsetRealTree;
                const int separationCriteria = (idxLevel != OctreeHeight-1 ? 1 : leafLevelSeparationCriteria);
                if(!procHasWorkAtLevel(idxLevel, idProcess)){
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;
                    continue;
                }

                // put the received data into a temporary tree
                FLightOctree<CellClass> tempTree;
                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                    if(recvBuffer[idxLevel * nbProcess + idxProc]){
                        const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();
                        for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
                            const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
                            const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
                            FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);

                            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();

                            CellClass* const newCell = new CellClass;
                            newCell->setMortonIndex(cellIndex);
                            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);

                            tempTree.insertCell(cellIndex, idxLevel, newCell);
                        }
                        FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
                                recvBuffer[idxLevel * nbProcess + idxProc]->tell());
                    }
                }

                // take cells from our octree only if they are
                // linked to received data
                int numberOfCells = 0;
                int realCellId = 0;

                while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                    octreeIterator.moveRight();
                }
                // for each cell
                do{
                    // copy cells that need data from others
                    if(leafsNeedOther[idxLevel]->get(realCellId++)){
                        iterArray[numberOfCells++] = octreeIterator;
                    }
                } while(octreeIterator.moveRight());
                avoidGotoLeftIterator.moveDown();
                octreeIterator = avoidGotoLeftIterator;

                delete leafsNeedOther[idxLevel];
                leafsNeedOther[idxLevel] = nullptr;

                // Compute these cells
                FLOG(computationCounter.tic());