// FFmmAlgorithmThreadBalance.hpp
#ifndef FFmmAlgorithmThreadBalanceBALANCE_HPP
#define FFmmAlgorithmThreadBalanceBALANCE_HPP

#include "../Utils/FAssert.hpp"
#include "../Utils/FLog.hpp"
#include "../Utils/FTic.hpp"
#include "../Utils/FGlobal.hpp"
#include "Utils/FAlgorithmTimers.hpp"

#include "../Containers/FOctree.hpp"

#include "FCoreCommon.hpp"
#include "FP2PExclusion.hpp"

#include <omp.h>

#include <cstring>
#include <memory>
#include <vector>

/**
* \author Berenger Bramas (berenger.bramas@inria.fr)
* \brief Implements an FMM algorithm threaded using OpenMP.
*
* Please read the license
*
* This class runs a threaded FMM algorithm.
 * It balances the execution between threads.
*
* When using this algorithm the P2P is thread safe.
*
* This class does not deallocate pointers given to its constructor.
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion>
class FFmmAlgorithmThreadBalance : public FAbstractAlgorithm, public FAlgorithmTimers{
    OctreeClass* const tree;                  ///< The octree to work on.
    KernelClass** kernels;                    ///< The kernels.

38
    static const int SizeShape = P2PExclusionClass::SizeShape;
39 40 41 42 43

    const int MaxThreads;                     ///< The maximum number of threads.

    const int OctreeHeight;                   ///< The height of the given tree.

44
    const int leafLevelSeparationCriteria;
45 46 47 48 49 50 51 52 53 54 55 56 57

public:
    /** Class constructor
     *
     * The constructor needs the octree and the kernels used for computation.
     * One kernel clone is created per thread, since kernel instances are not
     * shared between threads.
     *
     * \param inTree the octree to work on.
     * \param inKernels the kernels to call.
     * \param inLeafLevelSeperationCriteria separation criteria used at the
     * leaf level during the M2L (must be < 3), default is 1.
     *
     * \except An exception is thrown if one of the arguments is NULL.
     */
    FFmmAlgorithmThreadBalance(OctreeClass* const inTree, KernelClass* const inKernels,
                               const int inLeafLevelSeperationCriteria = 1)
        : tree(inTree) , kernels(nullptr),
          MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()),
          leafLevelSeparationCriteria(inLeafLevelSeperationCriteria) {
        // NOTE(review): tree is already dereferenced in the initializer list
        // (tree->getHeight()), so this assert fires after the fact for a null tree.
        FAssertLF(tree, "tree cannot be null");
        FAssertLF(inKernels, "kernels cannot be null");
        FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3");

        // Clone the kernel once per thread; each thread builds its own copy
        // inside a critical section because the kernel copy-constructor is not
        // assumed to be thread-safe.
        this->kernels = new KernelClass*[MaxThreads];
        #pragma omp parallel num_threads(MaxThreads)
        {
            #pragma omp critical (InitFFmmAlgorithmThreadBalance)
            {
                this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels);
            }
        }

        FAbstractAlgorithm::setNbLevelsInTree(OctreeHeight);
        // Pre-compute the per-thread work intervals for every pass.
        buildThreadIntervals();

        FLOG(FLog::Controller << "FFmmAlgorithmThreadBalance (Max Thread " << omp_get_max_threads() << ")\n");
    }

    /** Destructor: releases the per-thread kernel clones and their holder. */
    virtual ~FFmmAlgorithmThreadBalance(){
        int threadId = 0;
        while(threadId < MaxThreads){
            delete this->kernels[threadId];
            threadId += 1;
        }
        delete [] this->kernels;
    }

    /**
      * Runs the complete algorithm.
      *
      * Each FMM pass runs only if its bit is set in operationsToProceed,
      * and each group of passes is timed through FAlgorithmTimers.
      *
      * @param operationsToProceed bitmask of FFmm* flags selecting the passes.
      */
    void executeCore(const unsigned operationsToProceed) override {

        // Leaf particles -> multipole expansions
        Timers[P2MTimer].tic();
        if(operationsToProceed & FFmmP2M) bottomPass();
        Timers[P2MTimer].tac();

        // Multipoles aggregated up the tree
        Timers[M2MTimer].tic();
        if(operationsToProceed & FFmmM2M) upwardPass();
        Timers[M2MTimer].tac();

        // Multipole-to-local transfers
        Timers[M2LTimer].tic();
        if(operationsToProceed & FFmmM2L) transferPass();
        Timers[M2LTimer].tac();

        // Local expansions propagated down the tree
        Timers[L2LTimer].tic();
        if(operationsToProceed & FFmmL2L) downardPass();
        Timers[L2LTimer].tac();

        // Near field: L2P then direct P2P, timed together under NearTimer
        Timers[NearTimer].tic();
        if(operationsToProceed & FFmmL2P) L2P();
        if(operationsToProceed & FFmmP2P) directPass();
        Timers[NearTimer].tac();
    }

protected:
    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////
    /** The workload contains what a thread needs to perform its interval of work */
    struct Workload{
        typename OctreeClass::Iterator iterator; ///< Start of the thread's interval in the tree
        int nbElements;                          ///< Number of consecutive elements to process
    };

    //< The work per thread for the P2M (one interval per thread)
    std::vector<Workload> workloadP2M;
    //< The work per level and per thread for the M2M (indexed [level][thread])
    std::vector<std::vector<Workload>> workloadM2M;
    //< The work per level and per thread for the M2L (indexed [level][thread])
    std::vector<std::vector<Workload>> workloadM2L;
    //< The work per level and per thread for the L2L (indexed [level][thread])
    std::vector<std::vector<Workload>> workloadL2L;
    //< The work per thread for the L2P (one interval per thread)
    std::vector<Workload> workloadL2P;
    //< The work per shape (color) and per thread for the P2P; each pair is a
    //< [first,second) index range — presumably into leafsDataArray, which is
    //< filled color by color in buildThreadIntervals
    std::vector<std::vector<std::pair<int,int>>> workloadP2P;

    /** This structure is needed by the thread for the P2P because of the colors */
    struct LeafData{
        MortonIndex index;       ///< Morton index of the leaf
        FTreeCoordinate coord;   ///< Tree coordinate of the leaf
        ContainerClass* targets; ///< Target particles of the leaf
        ContainerClass* sources; ///< Source particles of the leaf
    };
    /** Direct access to the data for the P2P, ordered color by color */
    std::unique_ptr<LeafData[]> leafsDataArray;

    /** This struct is used during the preparation of the interval */
    struct WorkloadTemp{
        typename OctreeClass::Iterator iterator; ///< Position of the element in the tree
        FSize amountOfWork;                      ///< Estimated cost of processing this element
    };

    /** From a vector of work (workPerElement) generate the interval */
    void generateIntervalFromWorkload(std::vector<Workload>* intervals, const FSize totalWork,
                                      WorkloadTemp* workPerElement, const FSize nbElements) const {
        // Now split between thread
        (*intervals).resize(MaxThreads);
159

160 161
        // Ideally each thread will have this
        const FSize idealWork = (totalWork/MaxThreads);
162 163
        ///FLOG(FLog::Controller << "[Balance] idealWork " << idealWork << "\n");

164 165 166 167 168
        // Assign default value for first thread
        int idxThread = 0;
        (*intervals)[idxThread].iterator = workPerElement[0].iterator;
        (*intervals)[idxThread].nbElements = 1;
        FSize assignWork = workPerElement[0].amountOfWork;
169

170
        for(int idxElement = 1 ; idxElement < nbElements ; ++idxElement){
171 172
            ///FLOG(FLog::Controller << "[Balance] idxElement " << workPerElement[idxElement].amountOfWork << "\n");
            ///FLOG(FLog::Controller << "[Balance] assignWork " << assignWork << "\n");
173 174 175 176
            // is it more balance if we add the current element to the current thread
            if(FMath::Abs((idxThread+1)*idealWork - assignWork) <
                    FMath::Abs((idxThread+1)*idealWork - assignWork - workPerElement[idxElement].amountOfWork)
                    && idxThread != MaxThreads-1){
177 178
                ///FLOG(FLog::Controller << "[Balance] Shape Thread " << idxThread << " goes from "
                ///      << (*intervals)[idxThread].iterator.getCurrentGlobalIndex() << " nb " << (*intervals)[idxThread].nbElements << "/" << nbElements << "\n");
179 180 181 182 183 184 185 186
                // if not start filling the next thread
                idxThread += 1;
                (*intervals)[idxThread].iterator = workPerElement[idxElement].iterator;
                (*intervals)[idxThread].nbElements = 0;
            }
            (*intervals)[idxThread].nbElements += 1;
            assignWork += workPerElement[idxElement].amountOfWork;
        }
187 188 189

        ///FLOG(FLog::Controller << "[Balance] Shape Thread " << idxThread << " goes from "
        ///      << (*intervals)[idxThread].iterator.getCurrentGlobalIndex() << " nb " << (*intervals)[idxThread].nbElements << "/" << nbElements << "\n");
190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209
    }

    void buildThreadIntervals(){
        // Reset the vectors
        workloadP2M.clear();
        workloadM2M.clear();
        workloadM2L.clear();
        workloadL2L.clear();
        workloadL2P.clear();
        workloadP2P.clear();

        // Count the number of leaves and color elements
        int shapeLeaves[SizeShape] = {0};
        int leafsNumber = 0;
        {
            typename OctreeClass::Iterator octreeIterator(tree);
            octreeIterator.gotoBottomLeft();
            do{
                ++leafsNumber;
                const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate();
210
                ++shapeLeaves[P2PExclusionClass::GetShapeIdx(coord)];
211 212 213 214
            } while(octreeIterator.moveRight());
        }

        // Allocate the working buffer
BRAMAS Berenger's avatar
BRAMAS Berenger committed
215 216
        std::unique_ptr<WorkloadTemp*[]> workloadBufferThread(new WorkloadTemp*[MaxThreads]);
        memset(workloadBufferThread.get(), 0, MaxThreads*sizeof(WorkloadTemp*));
217

BRAMAS Berenger's avatar
BRAMAS Berenger committed
218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245
        #pragma omp parallel
        {
            #pragma omp single
            {
                #pragma omp task
                { // Prepare P2M
                    if(workloadBufferThread[omp_get_thread_num()] == nullptr){
                        workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
                    }
                    WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];

                    /// FLOG(FLog::Controller << "[Balance] P2M:\n");
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    FSize idxLeaf = 0;
                    FSize totalWork = 0;
                    do{
                        // Keep track of tree iterator
                        workloadBuffer[idxLeaf].iterator = octreeIterator;
                        // Count the nb of particles as amount of work in the leaf
                        workloadBuffer[idxLeaf].amountOfWork = octreeIterator.getCurrentListSrc()->getNbParticles();
                        // Keep the total amount of work
                        totalWork += workloadBuffer[idxLeaf].amountOfWork;
                        ++idxLeaf;
                    } while(octreeIterator.moveRight());

                    generateIntervalFromWorkload(&workloadP2M, totalWork, workloadBuffer, idxLeaf);
                }
246

BRAMAS Berenger's avatar
BRAMAS Berenger committed
247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
                #pragma omp task
                { // Prepare L2P
                    if(workloadBufferThread[omp_get_thread_num()] == nullptr){
                        workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
                    }
                    WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];
                    /// FLOG(FLog::Controller << "[Balance] L2P:\n");
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
                    FSize idxLeaf = 0;
                    FSize totalWork = 0;
                    do{
                        // Keep track of tree iterator
                        workloadBuffer[idxLeaf].iterator = octreeIterator;
                        // Count the nb of particles as amount of work in the leaf
                        workloadBuffer[idxLeaf].amountOfWork = octreeIterator.getCurrentListTargets()->getNbParticles();
                        // Keep the total amount of work
                        totalWork += workloadBuffer[idxLeaf].amountOfWork;
                        ++idxLeaf;
                    } while(octreeIterator.moveRight());

                    generateIntervalFromWorkload(&workloadL2P, totalWork, workloadBuffer, idxLeaf);
                }
270

BRAMAS Berenger's avatar
BRAMAS Berenger committed
271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343
                #pragma omp task
                {// Do it for the M2L
                    if(workloadBufferThread[omp_get_thread_num()] == nullptr){
                        workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
                    }
                    WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];
                    /// FLOG(FLog::Controller << "[Balance] M2L:\n");
                    workloadM2L.resize(OctreeHeight);
                    typename OctreeClass::Iterator avoidGotoLeftIterator(tree);
                    avoidGotoLeftIterator.gotoBottomLeft();

                    const CellClass* neighbors[343];

                    for(int idxLevel = OctreeHeight-1 ; idxLevel >= 2 ; --idxLevel){
                        /// FLOG(FLog::Controller << "[Balance] \t level " << idxLevel << ":\n");
                        typename OctreeClass::Iterator octreeIterator(avoidGotoLeftIterator);
                        avoidGotoLeftIterator.moveUp();

                        FSize idxCell = 0;
                        FSize totalWork = 0;
                        do{
                            // Keep track of tree iterator
                            workloadBuffer[idxCell].iterator = octreeIterator;
                            // Count the nb of M2L for this cell
                            workloadBuffer[idxCell].amountOfWork = tree->getInteractionNeighbors(neighbors, octreeIterator.getCurrentGlobalCoordinate(), idxLevel, 1);
                            // Keep the total amount of work
                            totalWork += workloadBuffer[idxCell].amountOfWork;
                            ++idxCell;
                        } while(octreeIterator.moveRight());

                        // Now split between thread
                        generateIntervalFromWorkload(&workloadM2L[idxLevel], totalWork, workloadBuffer, idxCell);
                    }
                }
                #pragma omp task
                {// Do it for the M2M L2L
                    if(workloadBufferThread[omp_get_thread_num()] == nullptr){
                        workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
                    }
                    WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];
                    /// FLOG(FLog::Controller << "[Balance] M2M L2L:\n");
                    workloadM2M.resize(OctreeHeight);
                    workloadL2L.resize(OctreeHeight);
                    typename OctreeClass::Iterator avoidGotoLeftIterator(tree);
                    avoidGotoLeftIterator.gotoBottomLeft();
                    avoidGotoLeftIterator.moveUp();

                    for(int idxLevel = OctreeHeight-2 ; idxLevel >= 2 ; --idxLevel){
                        /// FLOG(FLog::Controller << "[Balance] \t level " << idxLevel << ":\n");
                        typename OctreeClass::Iterator octreeIterator(avoidGotoLeftIterator);
                        avoidGotoLeftIterator.moveUp();

                        FSize idxCell = 0;
                        FSize totalWork = 0;
                        do{
                            // Keep track of tree iterator
                            workloadBuffer[idxCell].iterator = octreeIterator;
                            // Count the nb of children of the current cell
                            workloadBuffer[idxCell].amountOfWork = 0;
                            CellClass** child = octreeIterator.getCurrentChild();
                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                                if(child[idxChild]) workloadBuffer[idxCell].amountOfWork += 1;
                            }
                            // Keep the total amount of work
                            totalWork += workloadBuffer[idxCell].amountOfWork;
                            ++idxCell;
                        } while(octreeIterator.moveRight());

                        // Now split between thread
                        generateIntervalFromWorkload(&workloadM2M[idxLevel], totalWork, workloadBuffer, idxCell);
                        generateIntervalFromWorkload(&workloadL2L[idxLevel], totalWork, workloadBuffer, idxCell);
                    }
                }
344

BRAMAS Berenger's avatar
BRAMAS Berenger committed
345 346 347 348 349 350
                #pragma omp task
                {
                    if(workloadBufferThread[omp_get_thread_num()] == nullptr){
                        workloadBufferThread[omp_get_thread_num()] = new WorkloadTemp[leafsNumber];
                    }
                    WorkloadTemp* workloadBuffer = workloadBufferThread[omp_get_thread_num()];
351
                    memset(workloadBuffer, 0, sizeof(struct WorkloadTemp)*leafsNumber);
BRAMAS Berenger's avatar
BRAMAS Berenger committed
352 353 354 355 356 357 358 359
                    // Prepare the P2P
                    const int LeafIndex = OctreeHeight - 1;
                    leafsDataArray.reset(new LeafData[leafsNumber]);

                    // We need the offset for each color
                    int startPosAtShape[SizeShape] = {0};
                    for(int idxShape = 1 ; idxShape < SizeShape ; ++idxShape){
                        startPosAtShape[idxShape] = startPosAtShape[idxShape-1] + shapeLeaves[idxShape-1];
360 361
                    }

BRAMAS Berenger's avatar
BRAMAS Berenger committed
362 363 364
                    // Prepare each color
                    typename OctreeClass::Iterator octreeIterator(tree);
                    octreeIterator.gotoBottomLeft();
365

BRAMAS Berenger's avatar
BRAMAS Berenger committed
366
                    FSize workPerShape[SizeShape] = {0};
367

BRAMAS Berenger's avatar
BRAMAS Berenger committed
368 369 370
                    // for each leafs
                    for(int idxLeaf = 0 ; idxLeaf < leafsNumber ; ++idxLeaf){
                        const FTreeCoordinate& coord = octreeIterator.getCurrentGlobalCoordinate();
371
                        const int shapePosition = P2PExclusionClass::GetShapeIdx(coord);
372

BRAMAS Berenger's avatar
BRAMAS Berenger committed
373
                        const int positionToWork = startPosAtShape[shapePosition]++;
374

BRAMAS Berenger's avatar
BRAMAS Berenger committed
375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405
                        leafsDataArray[positionToWork].index   = octreeIterator.getCurrentGlobalIndex();
                        leafsDataArray[positionToWork].coord   = coord;
                        leafsDataArray[positionToWork].targets = octreeIterator.getCurrentListTargets();
                        leafsDataArray[positionToWork].sources = octreeIterator.getCurrentListSrc();

                        // For now the cost is simply based on the number of particles
                        const FSize nbPartInLeaf = octreeIterator.getCurrentListTargets()->getNbParticles();
                        workloadBuffer[positionToWork].amountOfWork = nbPartInLeaf*nbPartInLeaf;
                        ContainerClass* neighbors[27];
                        tree->getLeafsNeighbors(neighbors, octreeIterator.getCurrentGlobalCoordinate(), LeafIndex);
                        for(int idxNeigh = 0 ; idxNeigh < 27 ; ++idxNeigh){
                            if(neighbors[idxNeigh]){
                                workloadBuffer[positionToWork].amountOfWork +=
                                         nbPartInLeaf * neighbors[idxNeigh]->getNbParticles();
                            }
                        }

                        workPerShape[shapePosition] += workloadBuffer[positionToWork].amountOfWork;

                        octreeIterator.moveRight();
                    }

                    workloadP2P.resize(SizeShape);
                    int offsetShape = 0;

                    for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                        std::vector<std::pair<int,int>>* intervals = &workloadP2P[idxShape];
                        const int nbElements = shapeLeaves[idxShape];
                        const FSize totalWork = workPerShape[idxShape];

                        // Now split between thread
406
                        (*intervals).resize(MaxThreads, std::pair<int,int>(0,0));
BRAMAS Berenger's avatar
BRAMAS Berenger committed
407 408 409 410 411
                        // Ideally each thread will have this
                        const FSize idealWork = (totalWork/MaxThreads);
                        // Assign default value for first thread
                        int idxThread = 0;
                        (*intervals)[idxThread].first = offsetShape;
412
                        FSize assignWork = workloadBuffer[offsetShape].amountOfWork;
BRAMAS Berenger's avatar
BRAMAS Berenger committed
413 414 415 416 417 418 419 420 421 422 423
                        for(int idxElement = 1+offsetShape ; idxElement < nbElements+offsetShape ; ++idxElement){
                            if(FMath::Abs((idxThread+1)*idealWork - assignWork) <
                                    FMath::Abs((idxThread+1)*idealWork - assignWork - workloadBuffer[idxElement].amountOfWork)
                                    && idxThread != MaxThreads-1){
                                (*intervals)[idxThread].second = idxElement;
                                idxThread += 1;
                                (*intervals)[idxThread].first = idxElement;
                            }
                            assignWork += workloadBuffer[idxElement].amountOfWork;
                        }
                        (*intervals)[idxThread].second = nbElements + offsetShape;
424

425 426 427 428 429 430
                        idxThread += 1;
                        while(idxThread != MaxThreads){
                            (*intervals)[idxThread].first = nbElements+offsetShape;
                            (*intervals)[idxThread].second = nbElements+offsetShape;
                            idxThread += 1;
                        }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
431 432

                        offsetShape += nbElements;
433 434
                    }
                }
BRAMAS Berenger's avatar
BRAMAS Berenger committed
435
            }
436

BRAMAS Berenger's avatar
BRAMAS Berenger committed
437
            #pragma omp taskwait
BRAMAS Berenger's avatar
BRAMAS Berenger committed
438
        }
439

BRAMAS Berenger's avatar
BRAMAS Berenger committed
440 441
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            delete[] workloadBufferThread[idxThread];
442 443 444 445 446 447 448 449 450 451 452 453 454 455
        }
    }


    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the P2M kernel. */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);

        FLOG(FTic computationCounter);
456
        #pragma omp parallel
457 458 459 460 461 462 463 464 465 466 467
        {
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            const int nbCellsToCompute = workloadP2M[omp_get_thread_num()].nbElements;
            typename OctreeClass::Iterator octreeIterator(workloadP2M[omp_get_thread_num()].iterator);

            for(int idxLeafs = 0 ; idxLeafs < nbCellsToCompute ; ++idxLeafs){
                // We need the current cell that represent the leaf
                // and the list of particles
                myThreadkernels->P2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentListSrc());
                octreeIterator.moveRight();
            }
468 469 470 471

            FAssertLF(omp_get_thread_num() == MaxThreads-1
                      || workloadP2M[omp_get_thread_num()+1].nbElements == 0
                      || octreeIterator.getCurrentGlobalIndex() == workloadP2M[omp_get_thread_num()+1].iterator.getCurrentGlobalIndex());
472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494
        }
        FLOG(computationCounter.tac() );

        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the M2M kernel. */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        // for each levels
        for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel ){
            FLOG(FTic counterTimeLevel);

            FLOG(computationCounter.tic());
495
            #pragma omp parallel
496 497 498 499 500 501 502 503 504 505 506
            {
                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadM2M[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator( workloadM2M[idxLevel][omp_get_thread_num()].iterator);

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
                    // We need the current cell and the child
                    // child is an array (of 8 child) that may be null
                    myThreadkernels->M2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                    octreeIterator.moveRight();
                }
507 508 509 510

                FAssertLF(omp_get_thread_num() == MaxThreads-1
                          || workloadM2M[idxLevel][omp_get_thread_num()+1].nbElements == 0
                          || octreeIterator.getCurrentGlobalIndex() == workloadM2M[idxLevel][omp_get_thread_num()+1].iterator.getCurrentGlobalIndex());
511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 527
            }

            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }


        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );

    }

    /////////////////////////////////////////////////////////////////////////////
    // Transfer
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the M2L kernel. */
528 529 530 531 532 533 534 535 536 537 538 539 540
  	/** M2L  */
  void transferPass(){
#ifdef SCALFMM_USE_EZTRACE	  
    eztrace_start();
#endif
    this->transferPassWithFinalize() ;
    //
#ifdef SCALFMM_USE_EZTRACE
    eztrace_stop();
#endif
  }

    void transferPassWithFinalize(){
541 542 543 544 545 546 547

        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        // for each levels
        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
548
            const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
549 550
            FLOG(FTic counterTimeLevel);
            FLOG(computationCounter.tic());
551
            #pragma omp parallel
552 553 554 555 556
            {
                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadM2L[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator( workloadM2L[idxLevel][omp_get_thread_num()].iterator);

557 558
                const CellClass* neighbors[342];
                int neighborPositions[342];
559 560

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
561 562
                    const int counter = tree->getInteractionNeighbors(neighbors, neighborPositions, octreeIterator.getCurrentGlobalCoordinate(), idxLevel, separationCriteria);
                    if(counter) myThreadkernels->M2L( octreeIterator.getCurrentCell() , neighbors, neighborPositions, counter, idxLevel);
563 564 565 566
                    octreeIterator.moveRight();
                }

                myThreadkernels->finishedLevelM2L(idxLevel);
567 568 569 570 571


                FAssertLF(omp_get_thread_num() == MaxThreads-1
                          || workloadM2L[idxLevel][omp_get_thread_num()+1].nbElements == 0
                          || octreeIterator.getCurrentGlobalIndex() == workloadM2L[idxLevel][omp_get_thread_num()+1].iterator.getCurrentGlobalIndex());
572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598
            }
            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
    }

    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the L2L kernel. */
    void downardPass(){

        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);


        const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1;
        // for each levels excepted leaf level
        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < heightMinusOne ; ++idxLevel ){
            FLOG(FTic counterTimeLevel);

            FLOG(computationCounter.tic());
599
            #pragma omp parallel
600 601 602 603 604 605 606 607 608
            {
                KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadL2L[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator( workloadL2L[idxLevel][omp_get_thread_num()].iterator);

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
                    myThreadkernels->L2L( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                    octreeIterator.moveRight();
                }
609 610 611 612

                FAssertLF(omp_get_thread_num() == MaxThreads-1
                          || workloadL2L[idxLevel][omp_get_thread_num()+1].nbElements == 0
                          || octreeIterator.getCurrentGlobalIndex() == workloadL2L[idxLevel][omp_get_thread_num()+1].iterator.getCurrentGlobalIndex());
613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629
            }
            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
    }




    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////

    void L2P(){
630
        #pragma omp parallel
631 632 633 634 635 636 637 638 639 640 641
        {
            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
            const int nbCellsToCompute = workloadL2P[omp_get_thread_num()].nbElements;
            typename OctreeClass::Iterator octreeIterator(workloadL2P[omp_get_thread_num()].iterator);

            for(int idxLeafs = 0 ; idxLeafs < nbCellsToCompute ; ++idxLeafs){
                // We need the current cell that represent the leaf
                // and the list of particles
                myThreadkernels->L2P( octreeIterator.getCurrentCell() , octreeIterator.getCurrentListTargets());
                octreeIterator.moveRight();
            }
642 643 644 645

            FAssertLF(omp_get_thread_num() == MaxThreads-1
                      || workloadL2P[omp_get_thread_num()+1].nbElements == 0
                      || octreeIterator.getCurrentGlobalIndex() == workloadL2P[omp_get_thread_num()+1].iterator.getCurrentGlobalIndex());
646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661
        }
    }

    /** Runs the P2P kernel.
      *
     * \param p2pEnabled Run the P2P kernel.
     * \param l2pEnabled Run the L2P kernel.
     */
    void directPass(){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic computationCounterP2P);

        const int LeafIndex = OctreeHeight - 1;

662
        #pragma omp parallel
663 664 665 666 667
        {
            FLOG(if(!omp_get_thread_num()) computationCounter.tic());

            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
            // There is a maximum of 26 neighbors
668 669
            ContainerClass* neighbors[26];
            int neighborPositions[26];
670 671 672 673 674 675 676

            for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                const std::pair<int,int> interval = workloadP2P[idxShape][omp_get_thread_num()];
                for(int idxLeafs = interval.first ; idxLeafs < interval.second ; ++idxLeafs){
                    LeafData& currentIter = leafsDataArray[idxLeafs];
                    // need the current particles and neighbors particles
                    FLOG(if(!omp_get_thread_num()) computationCounterP2P.tic());
677
                    const int counter = tree->getLeafsNeighbors(neighbors, neighborPositions, currentIter.coord, LeafIndex);
678
                    myThreadkernels.P2P(currentIter.coord, currentIter.targets,
679
                                        currentIter.sources, neighbors, neighborPositions, counter);
680 681
                    FLOG(if(!omp_get_thread_num()) computationCounterP2P.tac());
                }
682

683 684 685 686
                FAssertLF(omp_get_thread_num() == MaxThreads-1
                          || interval.second == workloadP2P[idxShape][omp_get_thread_num()+1].first,
                        omp_get_thread_num(), " ==> ", interval.second, " != ", workloadP2P[idxShape][omp_get_thread_num()+1].first);

687
                #pragma omp barrier
688 689 690 691 692 693 694 695 696 697 698 699 700 701
            }
        }

        FLOG(computationCounter.tac());

        FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.cumulated()    << " s\n" );
        FLOG( FLog::Controller << "\t\t Computation P2P :       " << computationCounterP2P.cumulated() << " s\n" );

    }

};

#endif