// See LICENCE file at project root
#ifndef FFMMALGORITHMTHREADPROC_HPP
#define FFMMALGORITHMTHREADPROC_HPP

#define SCALFMM_DISTRIBUTED_ALGORITHM


#include <algorithm>
#include <vector>
#include <memory>
//#include <sys/time.h>

//
#ifdef _OPENMP
#include <omp.h>
#endif
//
#include "Utils/FGlobal.hpp"
#include "Utils/FAssert.hpp"
#include "Utils/FLog.hpp"

#include "Utils/FTic.hpp"

#include "Utils/FEnv.hpp"
#include "Utils/FMpi.hpp"

#include "Containers/FVector.hpp"
#include "Containers/FBoolArray.hpp"
#include "Containers/FOctree.hpp"
#include "Containers/FLightOctree.hpp"

#include "Containers/FBufferWriter.hpp"
#include "Containers/FBufferReader.hpp"

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"

#include "Utils/FAlgorithmTimers.hpp"


/**
 * @author Berenger Bramas (berenger.bramas@inria.fr)
 *
 * Please read the license
 *
 * This class is a threaded FMM algorithm distributed using MPI. It iterates on
 * a tree and calls the kernels with the proper arguments. It uses the
 * inspector-executor model: it iterates on the tree, builds an array, and then
 * works in parallel on this array.
 *
 * This class does not free the pointers given as arguments.
 *
 * Threaded & based on the inspector-executor model
 *
 *     schedule(runtime) export OMP_NUM_THREADS=2
 *     export OMPI_CXX=`which g++-4.4`
 *     mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
 *        --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20
 *        --track-fds=yes ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
 */
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadProc : public FAbstractAlgorithm, public FAlgorithmTimers {

    using multipole_t       = typename CellClass::multipole_t;
    using local_expansion_t = typename CellClass::local_expansion_t;
    using symbolic_data_t   = CellClass;

private:
    OctreeClass* const tree;     ///< The octree to work on
    KernelClass** kernels;       ///< The kernels

    const int OctreeHeight;      ///< Tree height

    typename OctreeClass::Iterator* iterArray;      ///< Used to store pointers to cells/leafs to work with
    typename OctreeClass::Iterator* iterArrayComm;  ///< Used to store pointers to cells/leafs to send/rcv

    const FMpi::FComm comm;      ///< MPI communicator
    FMpi::FComm fcomCompute;

    int numberOfLeafs;           ///< Number of leaves held by this process
    const int MaxThreads;        ///< Max number of threads allowed by OpenMP
          int nbProcess;         ///< Process count
          int idProcess;         ///< Current process id
    const int nbProcessOrig;     ///< Process count (original communicator)
    const int idProcessOrig;     ///< Current process id (original communicator)

    const int userChunkSize;
    const int leafLevelSeparationCriteria;

    /** An interval is the Morton index interval
     * that a proc uses (i.e. it holds data in this interval) */
    struct Interval{
        MortonIndex leftIndex;
        MortonIndex rightIndex;
    };
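
    // Illustration (the values below are only an example): a Morton index at
    // level l+1 is built from its parent at level l by appending 3 bits, so the
    // parent interval of a child interval is obtained with a right shift by 3.
    // If a process holds the leaf interval [73, 134], its interval one level
    // above is [73 >> 3, 134 >> 3] = [9, 16].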

    /// Current process interval
    Interval*const intervals;
    /// All processes intervals
    Interval*const workingIntervalsPerLevel;

    /// Get an interval from a process id and tree level
    Interval& getWorkingInterval( int level,  int proc){
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }

    /// Get an interval from a process id and tree level
    const Interval& getWorkingInterval( int level,  int proc) const {
        return workingIntervalsPerLevel[OctreeHeight * proc + level];
    }
    /// Does \a idxProc have work at the given \a idxLevel
    /** i.e. does it hold cells and is it responsible for them? */
    bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
        return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
    }

    /** True if the \a idxProc left cell at \a idxLevel+1 has the same parent as our right cell */
    bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
    }

    /** True if the \a idxProc right cell at \a idxLevel+1 has the same parent as our left cell */
    bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
        return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
    }

public:
    ///
    /// \brief getWorkingInterval
    /// \param level level in the tree
    /// \return the interval of the current process at the given tree level
    ///
    Interval& getWorkingInterval( int level){
        return getWorkingInterval(level, idProcess);
    }
    /// Get the Morton index distribution at the leaf level
    ///
    /// Fill the vector mortonLeafDistribution
    ///
    /// If p is the MPI process id, then processor p owns the Morton indexes
    /// in [mortonLeafDistribution[2*p], mortonLeafDistribution[2*p+1]]
    ///
    /// \param[out] mortonLeafDistribution the flattened distribution
    ///
    void getMortonLeafDistribution(std::vector<MortonIndex> & mortonLeafDistribution) final {
        mortonLeafDistribution.resize(2*nbProcess) ;
        auto level = OctreeHeight - 1;
        for (int p = 0 ; p < nbProcess ; ++p ){
            auto inter = this->getWorkingInterval(level, p);
            mortonLeafDistribution[2*p]   = inter.leftIndex;
            mortonLeafDistribution[2*p+1] = inter.rightIndex;
        }
    }
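    // Usage sketch (illustrative only): once the algorithm has been executed
    // (the working intervals are built in executeCore), the leaf distribution
    // can be queried as follows, `algo` being an instance of this class:
    //
    //     std::vector<MortonIndex> distrib;
    //     algo.getMortonLeafDistribution(distrib);
    //     // distrib[2*p] .. distrib[2*p+1] is the Morton range owned by rank p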

    ///
    /// \brief setKernel
    /// \param inKernels a pointer to the computational kernel used in the algorithm
    ///
    /// @todo move it to the private section
    void setKernel(KernelClass*const inKernels){
        this->kernels = new KernelClass*[MaxThreads]{};
#pragma omp parallel num_threads(MaxThreads)
        {
#pragma omp critical (InitFFmmAlgorithmThreadProcPeriodic)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }
    }
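    // Note: one kernel copy is built per OpenMP thread because the kernels are
    // not assumed to be thread-safe (they may hold internal buffers). The copy
    // itself is done inside a critical section since the KernelClass copy
    // constructor is not required to be thread-safe either.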
    ///
    /// \brief getPtrOnMortonIndexAtLeaf
    /// \return a pointer to the first Morton index stored in the working intervals
    ///
    const MortonIndex * getPtrOnMortonIndexAtLeaf() {
        return &workingIntervalsPerLevel[0].leftIndex ;
    }
    ///
    /// \brief hasWorkAtLevel - Does the current process have some work at this level?
    /// \param level the level to test
    /// \return true if the current process has some work at this level
    bool hasWorkAtLevel( int level){
        return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).rightIndex) < (getWorkingInterval(level, idProcess).rightIndex);
    }

    /**@brief Constructor
     * @param inComm the MPI communicator to use
     * @param inTree the octree to work on
     * @param inKernels the kernels to call
     * @param inUserChunkSize the OpenMP chunk size used by the dynamic schedules
     * @param inLeafLevelSeperationCriteria the separation criteria used at the leaf level
     *
     * An assert is launched if one of the arguments is null
     */
    FFmmAlgorithmThreadProc(const FMpi::FComm& inComm, OctreeClass* const inTree,
                            KernelClass* const inKernels,
                            const int inUserChunkSize = 10,
                            const int inLeafLevelSeperationCriteria = 1) :
        tree(inTree),
        kernels(nullptr),
        OctreeHeight(tree->getHeight()),
        iterArray(nullptr),
        iterArrayComm(nullptr),
        comm(inComm),
        fcomCompute(inComm),
        numberOfLeafs(0),
        MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())),
        nbProcess(0),
        idProcess(0),
        nbProcessOrig(inComm.processCount()),
        idProcessOrig(inComm.processId()),
        userChunkSize(inUserChunkSize),
        leafLevelSeparationCriteria(inLeafLevelSeperationCriteria),
        intervals(new Interval[inComm.processCount()]),
        workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) {
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");

    //    this->setKernel(inKernels) ;
        this->kernels = new KernelClass*[MaxThreads];
#pragma omp parallel num_threads(MaxThreads)
        {
#pragma omp critical (InitFFmmAlgorithmThreadProc)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }

        FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight());

        FLOG(FLog::Controller << "FFmmAlgorithmThreadProc\n");
        FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcessOrig << ", I am " << idProcessOrig << ".\n");
        FLOG(FLog::Controller << "Chunk size = " << userChunkSize << "\n");
    }
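    // Minimal usage sketch (illustrative only; the octree/kernel constructor
    // arguments below are placeholders, not the exact ScalFMM signatures):
    //
    //     FMpi app(argc, argv);                       // MPI wrapper
    //     OctreeClass tree(height, subHeight, boxWidth, boxCenter);
    //     // ... insert the local particles into the tree ...
    //     KernelClass kernels(/* kernel-specific arguments */);
    //     FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass,
    //                             KernelClass, LeafClass> algo(app.global(), &tree, &kernels);
    //     algo.execute();   // inherited from FAbstractAlgorithm, runs the enabled passes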

    /// Default destructor
    virtual ~FFmmAlgorithmThreadProc(){
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete this->kernels[idxThread];
        }
        delete [] this->kernels;

        delete [] intervals;
        delete [] workingIntervalsPerLevel;
    }

protected:
    /**
     * To execute the FMM algorithm
     * Call this function to run the complete algorithm
     */
    void executeCore(const unsigned operationsToProceed) override {
        // We are not involved if the tree is empty
        const int iHaveParticles = (!tree->isEmpty());

        std::unique_ptr<int[]> hasParticles(new int[comm.processCount()]);
        FMpi::Assert( MPI_Allgather(const_cast<int*>(&iHaveParticles), 1,MPI_INT,
                                    hasParticles.get(), 1, MPI_INT,
                                    comm.getComm()), __LINE__);

        fcomCompute = FMpi::FComm(comm);
        fcomCompute.groupReduce(hasParticles.get());

        if(iHaveParticles){

            nbProcess = fcomCompute.processCount();
            idProcess = fcomCompute.processId();

            FLOG(FLog::Controller << "Max threads = "  << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");

            // Count leaves
#ifdef SCALFMM_TRACE_ALGO
            eztrace_resume();
#endif
            this->numberOfLeafs = 0;
            {
                Interval myFullInterval;
                {// Build the interval with the first and last leaves (and count the number of leaves)
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex();
                    do{
                        ++this->numberOfLeafs;
                    } while(octreeIterator.moveRight());
                    myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex();
                }
                // Allocate arrays to store the pointers to the cells at a level
                iterArray     = new typename OctreeClass::Iterator[numberOfLeafs];
                iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
                FAssertLF(iterArray,     "iterArray     bad alloc");
                FAssertLF(iterArrayComm, "iterArrayComm bad alloc");

                // We gather the leftIndex/rightIndex bounds from every proc
                FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, fcomCompute.getComm()),  __LINE__ );

                // Build my intervals for all levels
                std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]);
                // At leaf level we know it is the full interval
                myIntervals[OctreeHeight - 1] = myFullInterval;

                // We can estimate the interval for each level by using the parent/child relation
                for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
                    myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3;
                    myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3;
                }

                // Process 0 uses the estimates as real intervals, but other processes
                // should remove cells that belong to others
                if(idProcess != 0){
                    // We test for each level whether the process on the left (idProcess-1) owns cells I thought I owned
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    octreeIterator.moveUp();

                    // At h-1 the working limit is the parent of the right cell of the proc on the left
                    MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3;

                    // We check if we have no more work to do
                    int nullIntervalFromLevel = 0;

                    for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){
                        while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){
                            if( !octreeIterator.moveRight() ){
                                // We cannot move right, we do not own any more cells
                                nullIntervalFromLevel = idxLevel;
                                break;
                            }
                        }
                        // If we are responsible for some cells at this level keep the first index
                        if(nullIntervalFromLevel == 0){
                            myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex();
                            octreeIterator.moveUp();
                            workingLimitAtLevel >>= 3;
                        }
                    }
                    // In case we are not responsible for any cells we put the leftIndex = rightIndex+1
                    for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){
                        myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1;
                    }
                }

                // We gather every proc's per-level intervals
                FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
                                                workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, fcomCompute.getComm()),  __LINE__ );
            }
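            // From this point on, workingIntervalsPerLevel[proc * OctreeHeight + level]
            // describes, for every process, the Morton interval it is responsible
            // for at every level (see getWorkingInterval above).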
#ifdef SCALFMM_TRACE_ALGO
            eztrace_enter_event("P2M", EZTRACE_YELLOW);
#endif
            Timers[P2MTimer].tic();
            if(operationsToProceed & FFmmP2M) bottomPass();
            Timers[P2MTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
            eztrace_leave_event();
            eztrace_enter_event("M2M", EZTRACE_PINK);
#endif

            Timers[M2MTimer].tic();
            if(operationsToProceed & FFmmM2M) upwardPass();
            Timers[M2MTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
            eztrace_leave_event();
            eztrace_enter_event("M2L", EZTRACE_GREEN);
#endif

            Timers[M2LTimer].tic();
            if(operationsToProceed & FFmmM2L) transferPass();
            Timers[M2LTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
            eztrace_leave_event();
            eztrace_enter_event("L2L", EZTRACE_PINK);
#endif

            Timers[L2LTimer].tic();
            if(operationsToProceed & FFmmL2L) downardPass();
            Timers[L2LTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
            eztrace_leave_event();
            eztrace_enter_event("L2P+P2P", EZTRACE_BLUE);
#endif

            Timers[NearTimer].tic();
            if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P));
            Timers[NearTimer].tac();

#ifdef SCALFMM_TRACE_ALGO
            eztrace_leave_event();
            eztrace_stop();
#endif
            // delete arrays
            delete [] iterArray;
            delete [] iterArrayComm;
            iterArray     = nullptr;
            iterArrayComm = nullptr;
#ifdef SCALFMM_TRACE_ALGO
            eztrace_stop();
#endif
        }
        else{
            FLOG( FLog::Controller << "\tProcess = " << comm.processId() << " has zero particles.\n" );
        }
    }

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /**
     * P2M Bottom Pass
     * No communication is involved in the P2M.
     * It is similar to the multi-threaded version.
     */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        typename OctreeClass::Iterator octreeIterator(tree);

        // Copy the pointers to the leaves into the array
        octreeIterator.gotoBottomLeft();
        int leafs = 0;
        do{
            iterArray[leafs++] = octreeIterator;
        } while(octreeIterator.moveRight());

        FLOG(computationCounter.tic());
#pragma omp parallel num_threads(MaxThreads)
        {
            // Each thread gets its own kernel
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            // Parallel iteration on the leaves
#pragma omp for nowait schedule(dynamic, userChunkSize)
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){

                multipole_t* leaf_multipole = &(iterArray[idxLeafs].getCurrentCell()->getMultipoleData());
                symbolic_data_t* leaf_symbolic  = iterArray[idxLeafs].getCurrentCell();

                auto particles = iterArray[idxLeafs].getCurrentListSrc();

                myThreadkernels->P2M(
                    leaf_multipole,
                    leaf_symbolic,
                    particles
                    );
            }
        }
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller.flush());
    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** M2M */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic singleCounter);
        FLOG(FTic parallelCounter);

        // Start from leaf level (height-1)
        typename OctreeClass::Iterator octreeIterator(tree);
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();

        for(int idxLevel = OctreeHeight - 2 ; idxLevel > FAbstractAlgorithm::lowerWorkingLevel-1 ; --idxLevel){
            octreeIterator.moveUp();
        }

        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);

        // The proc to send the shared cells to
        // Starting at the proc on the left, this variable will go down to 0
        int currentProcIdToSendTo = (idProcess - 1);

        // There is a maximum of 1 send and 8-1 receptions
        MPI_Request requests[8];
        MPI_Status status[8];

        MPI_Request requestsSize[8];
        MPI_Status statusSize[8];

        FSize bufferSize;
        FBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7
        std::unique_ptr<FBufferReader[]> recvBuffer(new FBufferReader[7]);
        FSize recvBufferSize[7];
        CellClass recvBufferCells[7];

        // The first proc that sends me a cell
        // This variable will go up to nbProcess
        int firstProcThatSend = idProcess + 1;
        FLOG(computationCounter.tic());

        // for each level
        for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel ){
            // Are my cells covered by my neighbor's working interval, so that I have no more work?
            const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
            if(noMoreWorkForMe){
                FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
                break;
            }

            // Copy and count ALL the cells (even the ones outside the working interval)
            int totalNbCellsAtLevel = 0;
            do{
                iterArray[totalNbCellsAtLevel++] = octreeIterator;
            } while(octreeIterator.moveRight());
            avoidGotoLeftIterator.moveUp();
            octreeIterator = avoidGotoLeftIterator;

            int iterMpiRequests       = 0; // The iterator for send/recv requests
            int iterMpiRequestsSize   = 0; // The iterator for size send/recv requests

            int nbCellsToSkip     = 0; // The number of cells to skip
            // Skip all the cells that are out of my working interval
            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
                ++nbCellsToSkip;
            }

            // We need to know if we will recv something in order to know if threads skip the last cell
            int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
            bool hasToReceive = false;
            if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
                // Find the first proc that may send to me
                while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
                    firstProcThatSend += 1;
                }
                // Do we have to receive?
                if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
                    hasToReceive = true;
                    // Threads do not compute the last cell, we will do it once data are received
                    nbCellsForThreads -= 1;
                }
            }

            FLOG(parallelCounter.tic());
#pragma omp parallel num_threads(MaxThreads)
            {
                KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
                // This single section posts and receives the comms, and then does the M2M associated with them.
#pragma omp single nowait
                {
                    FLOG(singleCounter.tic());
                    // Master proc never sends
                    if(idProcess != 0){
                        // Skip processes that have no work at that level
                        while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo)  ){
                            --currentProcIdToSendTo;
                        }
                        // Does the next proc that has work share the parent of my left cell?
                        if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
                            FAssertLF(nbCellsToSkip != 0);

                            char packageFlags = 0;
                            sendBuffer.write(packageFlags);

                            // Only the right-most cell out of my working interval should be taken into
                            // consideration (at pos nbCellsToSkip-1); the others (x < nbCellsToSkip-1) have already been sent
                            const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                // Check if the child exists and is part of my working interval
                                if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
                                    // Add the cell to the buffer
                                    child[idxChild]->serializeUp(sendBuffer);
                                    packageFlags = char(packageFlags | (0x1 << idxChild));
                                }
                            }
                            // Add the flag as first value
                            sendBuffer.writeAt(0,packageFlags);
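                            // Buffer layout at this point:
                            //   [packageFlags (1 byte)][serialized child]...[serialized child]
                            // Bit i of packageFlags tells the receiver whether child i of the
                            // shared parent cell is present in the buffer.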
                            // Post the message
                            bufferSize = sendBuffer.getSize();
                            MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                      FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]);
                            FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo,
                                      FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]);
                        }
                    }

                    // Post the receives; data is needed in several parts of the section
                    int nbProcThatSendToMe = 0;

                    if(hasToReceive){
                        // Scan from firstProcThatSend for the procs whose cells share the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]),
                                          idProcSource, FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequestsSize){
                        FAssertLF(iterMpiRequestsSize <= 8);
                        MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize);
                    }

                    if(hasToReceive){
                        nbProcThatSendToMe = 0;
                        // Scan again for the procs whose cells share the parent of my right border cell
                        int idProcSource = firstProcThatSend;
                        // Find the last proc that should send to me
                        while( idProcSource < nbProcess
                               && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
                            if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE,
                                          idProcSource, FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]);
                                nbProcThatSendToMe += 1;
                                FAssertLF(nbProcThatSendToMe <= 7);
                            }
                            ++idProcSource;
                        }
                    }

                    // Wait for the comms, and do the work
                    // Are we sending or waiting anything?
                    if(iterMpiRequests){
                        FAssertLF(iterMpiRequests <= 8);
                        MPI_Waitall( iterMpiRequests, requests, status);
                    }

                    // We have received something so we need to perform the last M2M
                    if( hasToReceive ){
                        FAssertLF(iterMpiRequests != 0);
                        CellClass* currentChild[8];
                        memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));

                        // Retrieve the data and merge my children with the children from others
                        int positionToInsert = 0;
                        for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
                            unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>());

                            int position = 0;
                            while( packageFlags && position < 8){
                                while(!(packageFlags & 0x1)){
                                    packageFlags >>= 1;
                                    ++position;
                                }
                                FAssertLF(positionToInsert < 7);
                                FAssertLF(position < 8);
                                FAssertLF(!currentChild[position], "Already has a cell here");
                                recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]);
                                currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert];

                                packageFlags >>= 1;
                                position += 1;
                                positionToInsert += 1;
                            }

                            recvBuffer[idxProc].seek(0);
                        }
                        // Finally compute

                        multipole_t* parent_multipole
                            = & iterArray[totalNbCellsAtLevel-1].getCurrentCell()->getMultipoleData();
                        const symbolic_data_t* parent_symbolic
                            = iterArray[totalNbCellsAtLevel-1].getCurrentCell();

                        std::array<const multipole_t*, 8> child_multipoles {};
                        std::transform(currentChild, currentChild+8, std::begin(child_multipoles),
                                       [](CellClass* c) {
                                           return c != nullptr ? &(c->getMultipoleData()) : nullptr;
                                       });
                        std::array<const symbolic_data_t*, 8> child_symbolics  {};
                        std::copy(currentChild, currentChild+8, std::begin(child_symbolics));

                        myThreadkernels->M2M(
                            parent_multipole,
                            parent_symbolic,
                            child_multipoles.data(),
                            child_symbolics.data()
                            );
                        firstProcThatSend += nbProcThatSendToMe - 1;
                    }
                    // Reset buffer
                    sendBuffer.reset();
                    FLOG(singleCounter.tac());
                }//End Of Single section
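                // Note: thanks to `single nowait` above, one thread deals with the MPI
                // exchanges and the border M2M while the other threads can already start
                // the worksharing loop below on the interior cells.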

                // All threads proceed the M2M
#pragma omp for nowait schedule(dynamic, userChunkSize)
                for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){

                    multipole_t* parent_multipole
                        = &(iterArray[idxCell].getCurrentCell()->getMultipoleData());
                    const symbolic_data_t* parent_symbolic
                        = iterArray[idxCell].getCurrentCell();

                    const auto* children = iterArray[idxCell].getCurrentChild();

                    std::array<const multipole_t*, 8> child_multipoles {};
                    std::transform(children, children+8, std::begin(child_multipoles),
                                   [](CellClass* c) {
                                       return c != nullptr ? &(c->getMultipoleData()) : nullptr;
                                   });
                    std::array<const symbolic_data_t*, 8> child_symbolics  {};
                    std::copy(children, children+8, std::begin(child_symbolics));

                    myThreadkernels->M2M(
                        parent_multipole,
                        parent_symbolic,
                        child_multipoles.data(),
                        child_symbolics.data()
                        );
                }
            }//End of parallel section
            FLOG(parallelCounter.tac());
        }

        FLOG(counterTime.tac());
        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.elapsed() << " s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
        FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller.flush());
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////


    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic sendCounter);
        FLOG(FTic receiveCounter);
        FLOG(FTic prepareCounter);
        FLOG(FTic gatherCounter);

        //////////////////////////////////////////////////////////////////
        // First know what to send to whom
        //////////////////////////////////////////////////////////////////

        // pointers to send
        std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]);
        // index
        long long int*const indexToSend = new long long int[nbProcess * OctreeHeight];
        memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight);
        // To know which cells need data from someone else
        FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
        memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);

        // All processes tell each other
        // what they will send to whom
        long long int*const globalReceiveMap = new long long  int[nbProcess * nbProcess * OctreeHeight];
        memset(globalReceiveMap, 0, sizeof(long long  int) * nbProcess * nbProcess * OctreeHeight);

        FBufferWriter**const sendBuffer = new FBufferWriter*[nbProcess * OctreeHeight];
        memset(sendBuffer, 0, sizeof(FBufferWriter*) * nbProcess * OctreeHeight);

        FBufferReader**const recvBuffer = new FBufferReader*[nbProcess * OctreeHeight];
        memset(recvBuffer, 0, sizeof(FBufferReader*) * nbProcess * OctreeHeight);

#pragma omp parallel num_threads(MaxThreads)
        {
#pragma omp master
            {
                {
                    FLOG(prepareCounter.tic());

                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);

                    // To know if a leaf has been already sent to a proc
                    bool*const alreadySent = new bool[nbProcess];
                    memset(alreadySent, 0, sizeof(bool) * nbProcess);

                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.moveDown();

                    for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
                        octreeIterator.moveDown();
                    }

                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                    // for each level
                    for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){

                        const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);

                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
                            avoidGotoLeftIterator.moveDown();
                            octreeIterator = avoidGotoLeftIterator;
                            continue;
                        }

                        int numberOfCells = 0;

                        while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                            octreeIterator.moveRight();
                        }

                        // for each cell
                        do{
                            iterArrayLocal[numberOfCells] = octreeIterator;
                            ++numberOfCells;
                        } while(octreeIterator.moveRight());
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;

                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);

                        // Which cells potentially need data from others and, at the same time,
                        // are potentially needed by others
                        MortonIndex neighborsIndexes[/*189+26+1*/216];
                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                            // Find the M2L neighbors of a cell
                            const int counter = iterArrayLocal[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes, separationCriteria);

                            memset(alreadySent, false, sizeof(bool) * nbProcess);
                            bool needOther = false;
                            // Test each neighbor to know which ones do not belong to us
                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
                                    int procToReceive = idProcess;
                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
                                        --procToReceive;
                                    }
                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
                                        ++procToReceive;
                                    }
                                    // Maybe already sent to that proc?
                                    if( !alreadySent[procToReceive]
                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){

                                        alreadySent[procToReceive] = true;

                                        needOther = true;

                                        toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
                                        if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
                                            indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
                                        }
                                        indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getMultipoleData().getSavedSize();
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
                                        indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
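                                        // Size accounting sketch: each message starts with an int
                                        // (the number of cells); then, for every cell, an FSize
                                        // offset, its MortonIndex and its serialized multipole
                                        // (getSavedSize() bytes) are appended. This matches the
                                        // writes performed when the buffers are filled below.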
                                    }
                                }
                            }
                            if(needOther){
                                leafsNeedOther[idxLevel]->set(idxCell,true);
                            }
                        }
                    }
                    FLOG(prepareCounter.tac());

                    delete[] alreadySent;
                }

                //////////////////////////////////////////////////////////////////
                // Gather this information
                //////////////////////////////////////////////////////////////////

                FLOG(gatherCounter.tic());
                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, fcomCompute.getComm()),  __LINE__ );
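                // After this Allgather,
                //   globalReceiveMap[idxProc * nbProcess * OctreeHeight + idxLevel * nbProcess + idProcess]
                // holds the number of bytes that idxProc will send to this process at idxLevel,
                // which is exactly how it is read when the receives are posted below.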
                FLOG(gatherCounter.tac());

                //////////////////////////////////////////////////////////////////
                // Send and receive for real
                //////////////////////////////////////////////////////////////////

                FLOG(sendCounter.tic());
                // Then they can send and receive (because they know what they will receive)
                // To send in an asynchronous way
                std::vector<MPI_Request> requests;
                requests.reserve(2 * nbProcess * OctreeHeight);

                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                        if(toSendAtProcAtLevel != 0){
                            sendBuffer[idxLevel * nbProcess + idxProc] = new FBufferWriter(toSendAtProcAtLevel);

                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));

                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
                            }

                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);

                            FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(),
                                    sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc,
                                    FMpi::TagLast + idxLevel*100, fcomCompute, &requests);
                        }

                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                        if(toReceiveFromProcAtLevel){
                            recvBuffer[idxLevel * nbProcess + idxProc] = new FBufferReader(toReceiveFromProcAtLevel);

                            FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
                                    recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc,
                                    FMpi::TagLast + idxLevel*100, fcomCompute, &requests);
                        }
                    }
                }

                //////////////////////////////////////////////////////////////////
                // Wait for received data and compute
                //////////////////////////////////////////////////////////////////

                // Wait to receive everything (and send everything)
                FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUS_IGNORE), __LINE__);

                FLOG(sendCounter.tac());
            }//End of Master region

            //////////////////////////////////////////////////////////////////
            // Do M2L
            //////////////////////////////////////////////////////////////////

#pragma omp single nowait
            {
                typename OctreeClass::Iterator octreeIterator(tree);
                octreeIterator.moveDown();

                for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
                    octreeIterator.moveDown();
                }

                typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
                // Now we can compute all the data
                // for each level
                for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
                    const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);

                    if(!procHasWorkAtLevel(idxLevel, idProcess)){
                        avoidGotoLeftIterator.moveDown();
                        octreeIterator = avoidGotoLeftIterator;
                        continue;
                    }

                    int numberOfCells = 0;
                    while(octreeIterator.getCurrentGlobalIndex() <  getWorkingInterval(idxLevel , idProcess).leftIndex){
                        octreeIterator.moveRight();
                    }
                    // for each cell
                    do{
                        iterArray[numberOfCells] = octreeIterator;
                        ++numberOfCells;
                    } while(octreeIterator.moveRight());
                    avoidGotoLeftIterator.moveDown();
                    octreeIterator = avoidGotoLeftIterator;

                    FLOG(computationCounter.tic());
                    {
                        const int chunckSize = userChunkSize;
                        for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
#pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
                            {
                                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                                const CellClass* neighbors[342] {};
                                int neighborPositions[342] {};
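                                // Each task processes up to chunckSize consecutive cells. The two
                                // arrays above are sized for the worst-case number of interaction
                                // neighbors; only the locally available neighbors are used here,
                                // cells received from other processes are treated in a later phase
                                // of this pass.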

                                const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
                                for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute) {
                                    const int counter = tree->getInteractionNeighbors(
                                        neighbors,
                                        neighborPositions,
                                        iterArray[idxCellToCompute].getCurrentGlobalCoordinate(),
                                        idxLevel,
                                        separationCriteria
                                        );
                                    if(counter) {
                                        local_expansion_t* target_local_expansion
                                            = &(iterArray[idxCellToCompute].getCurrentCell()->getLocalExpansionData());
                                        const symbolic_data_t* target_symbolic
                                            = iterArray[idxCellToCompute].getCurrentCell();

                                        std::array<const multipole_t*, 342> source_multipoles {};
                                        std::transform(std::begin(neighbors), std::end(neighbors),
                                                       std::begin(source_multipoles),
                                                       [](const CellClass* c) {
                                                           return ((c != nullptr)
                                                                   ? &(c->getMultipoleData())
                                                                   : nullptr);
                                                       });

                                        std::array<const symbolic_data_t*, 342> source_symbolics {};
                                        std::copy(std::begin(neighbors), std::end(neighbors),
                                                  std::begin(source_symbolics));

                                        myThreadkernels->M2L(
                                            target_local_expansion,
                                            target_symbolic,
                                            source_multipoles.data(),
                                            source_symbolics.data(),
                                            neighborPositions,
                                            counter
                                            );
                                    }
                                }
                            }
                        }
                    }//End of task spawning