Mentions légales du service
Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
S
ScalFMM
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Package registry
Container registry
Model registry
Operate
Environments
Terraform modules
Monitor
Incidents
Service Desk
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
solverstack
ScalFMM
Commits
53a0afb5
Commit
53a0afb5
authored
10 years ago
by
BRAMAS Berenger
Browse files
Options
Downloads
Patches
Plain Diff
Add a balance FMM algo - kernel independant
parent
e0898b5b
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
Src/Core/FFmmAlgorithmThreadBalance.hpp
+609
-0
609 additions, 0 deletions
Src/Core/FFmmAlgorithmThreadBalance.hpp
with
609 additions
and
0 deletions
Src/Core/FFmmAlgorithmThreadBalance.hpp
0 → 100644
+
609
−
0
View file @
53a0afb5
#ifndef FFmmAlgorithmThreadBalanceBALANCE_HPP
#define FFmmAlgorithmThreadBalanceBALANCE_HPP
#include
"../Utils/FAssert.hpp"
#include
"../Utils/FLog.hpp"
#include
"../Utils/FTic.hpp"
#include
"../Utils/FGlobal.hpp"
#include
"Utils/FAlgorithmTimers.hpp"
#include
"../Containers/FOctree.hpp"
#include
"FCoreCommon.hpp"
#include
<omp.h>
#include
<vector>
#include
<memory>
/**
 * \author Berenger Bramas (berenger.bramas@inria.fr)
 * \brief Implements an FMM algorithm threaded using OpenMP.
 *
 * Please read the license
 *
 * This class runs a threaded FMM algorithm.
 * It balances the execution between threads: per-thread work intervals are
 * precomputed for each pass (see buildThreadIntervals).
 *
 * When using this algorithm the P2P is thread safe.
 *
 * This class does not deallocate pointers given to its constructor.
 */
template <class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadBalance : public FAbstractAlgorithm, public FAlgorithmTimers {
    OctreeClass* const tree;   ///< The octree to work on.
    KernelClass** kernels;     ///< The kernels (one private copy per thread, allocated in the constructor).

    /// Number of leaf "shapes" (colors) derived from the leaf coordinates
    /// modulo 3 on each axis: (x%3)*9 + (y%3)*3 + (z%3).
    static const int SizeShape = 3*3*3;

    const int MaxThreads;      ///< The maximum number of threads.
    const int OctreeHeight;    ///< The height of the given tree.
    /// Separation criteria forwarded to getInteractionNeighbors at the leaf level (M2L).
    const int leafLevelSeperationCriteria;

public:
    /** Class constructor
     *
     * The constructor needs the octree and the kernels used for computation.
     * \param inTree the octree to work on.
     * \param inKernels the kernels to call.
     * \param inUserChunkSize to specify the chunk size in the loops (-1 is static, 0 is N/p^2,
     *        otherwise it is directly used as the number of items to proceed together), default is 10.
     *        NOTE(review): this parameter is not referenced anywhere in this class body — confirm intent.
     * \param inLeafLevelSeperationCriteria separation criteria used for the M2L at the leaf level.
     *
     * \except An exception is thrown if one of the arguments is NULL.
     */
    FFmmAlgorithmThreadBalance(OctreeClass* const inTree, KernelClass* const inKernels,
                               const int inUserChunkSize = 10, const int inLeafLevelSeperationCriteria = 1)
        : tree(inTree), kernels(nullptr),
          MaxThreads(omp_get_max_threads()), OctreeHeight(tree->getHeight()),
          leafLevelSeperationCriteria(inLeafLevelSeperationCriteria) {
        FAssertLF(tree, "tree cannot be null");

        // One kernel copy per thread.
        this->kernels = new KernelClass*[MaxThreads];
        // Each thread copy-constructs its own kernel; the named critical section
        // serializes the copies (presumably the KernelClass copy is not thread
        // safe — TODO confirm) while the static parallel-for keeps each copy
        // allocated by the thread that will use it.
        #pragma omp parallel for schedule(static)
        for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
            #pragma omp critical (InitFFmmAlgorithmThreadBalance)
            {
                this->kernels[idxThread] = new KernelClass(*inKernels);
            }
        }

        FAbstractAlgorithm::setNbLevelsInTree(OctreeHeight);

        // Precompute the per-thread work intervals for every pass.
        buildThreadIntervals();

        FLOG(FLog::Controller << "FFmmAlgorithmThreadBalance (Max Thread " << omp_get_max_threads() << ")\n");
    }
/** Default destructor */
virtual
~
FFmmAlgorithmThreadBalance
(){
for
(
int
idxThread
=
0
;
idxThread
<
MaxThreads
;
++
idxThread
){
delete
this
->
kernels
[
idxThread
];
}
delete
[]
this
->
kernels
;
}
/**
* Runs the complete algorithm.
*/
void
executeCore
(
const
unsigned
operationsToProceed
)
override
{
Timers
[
P2MTimer
].
tic
();
if
(
operationsToProceed
&
FFmmP2M
)
bottomPass
();
Timers
[
P2MTimer
].
tac
();
Timers
[
M2MTimer
].
tic
();
if
(
operationsToProceed
&
FFmmM2M
)
upwardPass
();
Timers
[
M2MTimer
].
tac
();
Timers
[
M2LTimer
].
tic
();
if
(
operationsToProceed
&
FFmmM2L
)
transferPass
();
Timers
[
M2LTimer
].
tac
();
Timers
[
L2LTimer
].
tic
();
if
(
operationsToProceed
&
FFmmL2L
)
downardPass
();
Timers
[
L2LTimer
].
tac
();
Timers
[
NearTimer
].
tic
();
if
(
operationsToProceed
&
FFmmL2P
)
L2P
();
if
(
operationsToProceed
&
FFmmP2P
)
directPass
();
Timers
[
NearTimer
].
tac
();
}
protected:

    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /** The workload contains what a thread needs to perform its interval of work:
     *  the iterator positioned on its first element and the number of elements. */
    struct Workload{
        typename OctreeClass::Iterator iterator; // First element of the thread's interval
        int nbElements;                          // Number of consecutive elements to process
    };

    //< The work per thread for the P2M
    std::vector<Workload> workloadP2M;
    //< The work per level and per thread for the M2M
    std::vector<std::vector<Workload>> workloadM2M;
    //< The work per level and per thread for the M2L
    std::vector<std::vector<Workload>> workloadM2L;
    //< The work per level and per thread for the L2L
    std::vector<std::vector<Workload>> workloadL2L;
    //< The work per thread for the L2P
    std::vector<Workload> workloadL2P;
    //< The work per shape and per thread for the P2P:
    //< workloadP2P[shape][thread] = [first, second) index range into leafsDataArray
    std::vector<std::vector<std::pair<int,int>>> workloadP2P;

    /** This structure is needed by the thread for the P2P because of the colors:
     *  leaves are stored grouped by shape, so direct octree iteration cannot be used. */
    struct LeafData{
        MortonIndex index;        // Morton index of the leaf
        FTreeCoordinate coord;    // Coordinate of the leaf
        ContainerClass* targets;  // Target particles of the leaf
        ContainerClass* sources;  // Source particles of the leaf
    };
    /** Direct access to the data for the P2P (filled shape by shape in buildThreadIntervals) */
    std::unique_ptr<LeafData[]> leafsDataArray;

    /** This struct is used during the preparation of the intervals:
     *  one entry per element with its estimated cost. */
    struct WorkloadTemp{
        typename OctreeClass::Iterator iterator; // Position of the element in the tree
        FSize amountOfWork;                      // Estimated cost for this element
    };
    /** From a vector of work (workPerElement) generate the per-thread intervals.
     *
     * Performs a greedy split: elements are assigned in order to the current
     * thread until adding the next element would move the cumulative work
     * further from the thread's ideal target ((idxThread+1)*idealWork) than
     * leaving it out, at which point the next thread starts. The last thread
     * absorbs any remainder.
     *
     * \param intervals output, resized to MaxThreads; entry i holds the first
     *        iterator and the count of elements for thread i.
     * \param totalWork sum of amountOfWork over all elements.
     * \param workPerElement the elements (iterator + cost), in tree order.
     * \param nbElements number of valid entries in workPerElement.
     *        NOTE(review): workPerElement[0] is read unconditionally, so the
     *        caller must pass at least one element.
     */
    void generateIntervalFromWorkload(std::vector<Workload>* intervals, const FSize totalWork,
                                      WorkloadTemp* workPerElement, const FSize nbElements) const {
        // Now split between thread
        (*intervals).resize(MaxThreads);

        // Ideally each thread will have this
        const FSize idealWork = (totalWork/MaxThreads);

        // Assign default value for first thread
        int idxThread = 0;
        (*intervals)[idxThread].iterator = workPerElement[0].iterator;
        (*intervals)[idxThread].nbElements = 1;
        FSize assignWork = workPerElement[0].amountOfWork;

        for(int idxElement = 1 ; idxElement < nbElements ; ++idxElement){
            // is it more balance if we add the current element to the current thread
            if(FMath::Abs((idxThread+1)*idealWork - assignWork) <
                    FMath::Abs((idxThread+1)*idealWork - assignWork - workPerElement[idxElement].amountOfWork)
                    && idxThread != MaxThreads-1){
                /// FLOG(FLog::Controller << "[Balance] Shape Thread " << idxThread << " goes from "
                /// << (*intervals)[idxThread].iterator.getCurrentGlobalIndex() << " nb " << (*intervals)[idxThread].nbElements << "\n");
                // if not start filling the next thread
                idxThread += 1;
                (*intervals)[idxThread].iterator = workPerElement[idxElement].iterator;
                (*intervals)[idxThread].nbElements = 0;
            }
            (*intervals)[idxThread].nbElements += 1;
            assignWork += workPerElement[idxElement].amountOfWork;
        }
        /// FLOG(FLog::Controller << "[Balance] Shape Thread " << idxThread << " goes from "
        /// << (*intervals)[idxThread].iterator.getCurrentGlobalIndex() << " nb " << (*intervals)[idxThread].nbElements << "\n");
    }
void
buildThreadIntervals
(){
// Reset the vectors
workloadP2M
.
clear
();
workloadM2M
.
clear
();
workloadM2L
.
clear
();
workloadL2L
.
clear
();
workloadL2P
.
clear
();
workloadP2P
.
clear
();
// Count the number of leaves and color elements
int
shapeLeaves
[
SizeShape
]
=
{
0
};
int
leafsNumber
=
0
;
{
typename
OctreeClass
::
Iterator
octreeIterator
(
tree
);
octreeIterator
.
gotoBottomLeft
();
do
{
++
leafsNumber
;
const
FTreeCoordinate
&
coord
=
octreeIterator
.
getCurrentCell
()
->
getCoordinate
();
++
shapeLeaves
[(
coord
.
getX
()
%
3
)
*
9
+
(
coord
.
getY
()
%
3
)
*
3
+
(
coord
.
getZ
()
%
3
)];
}
while
(
octreeIterator
.
moveRight
());
}
// Allocate the working buffer
std
::
unique_ptr
<
WorkloadTemp
[]
>
workloadBuffer
(
new
WorkloadTemp
[
leafsNumber
]);
{
// Prepare P2M
/// FLOG(FLog::Controller << "[Balance] P2M:\n");
typename
OctreeClass
::
Iterator
octreeIterator
(
tree
);
octreeIterator
.
gotoBottomLeft
();
FSize
idxLeaf
=
0
;
FSize
totalWork
=
0
;
do
{
// Keep track of tree iterator
workloadBuffer
[
idxLeaf
].
iterator
=
octreeIterator
;
// Count the nb of particles as amount of work in the leaf
workloadBuffer
[
idxLeaf
].
amountOfWork
=
octreeIterator
.
getCurrentListSrc
()
->
getNbParticles
();
// Keep the total amount of work
totalWork
+=
workloadBuffer
[
idxLeaf
].
amountOfWork
;
++
idxLeaf
;
}
while
(
octreeIterator
.
moveRight
());
generateIntervalFromWorkload
(
&
workloadP2M
,
totalWork
,
workloadBuffer
.
get
(),
idxLeaf
);
}
{
// Prepare L2P
/// FLOG(FLog::Controller << "[Balance] L2P:\n");
typename
OctreeClass
::
Iterator
octreeIterator
(
tree
);
octreeIterator
.
gotoBottomLeft
();
FSize
idxLeaf
=
0
;
FSize
totalWork
=
0
;
do
{
// Keep track of tree iterator
workloadBuffer
[
idxLeaf
].
iterator
=
octreeIterator
;
// Count the nb of particles as amount of work in the leaf
workloadBuffer
[
idxLeaf
].
amountOfWork
=
octreeIterator
.
getCurrentListTargets
()
->
getNbParticles
();
// Keep the total amount of work
totalWork
+=
workloadBuffer
[
idxLeaf
].
amountOfWork
;
++
idxLeaf
;
}
while
(
octreeIterator
.
moveRight
());
generateIntervalFromWorkload
(
&
workloadL2P
,
totalWork
,
workloadBuffer
.
get
(),
idxLeaf
);
}
{
// Do it for the M2L
/// FLOG(FLog::Controller << "[Balance] M2L:\n");
workloadM2L
.
resize
(
OctreeHeight
);
typename
OctreeClass
::
Iterator
avoidGotoLeftIterator
(
tree
);
avoidGotoLeftIterator
.
gotoBottomLeft
();
const
CellClass
*
neighbors
[
343
];
for
(
int
idxLevel
=
OctreeHeight
-
1
;
idxLevel
>=
2
;
--
idxLevel
){
FLOG
(
FLog
::
Controller
<<
"[Balance]
\t
level "
<<
idxLevel
<<
":
\n
"
);
typename
OctreeClass
::
Iterator
octreeIterator
(
avoidGotoLeftIterator
);
avoidGotoLeftIterator
.
moveUp
();
FSize
idxCell
=
0
;
FSize
totalWork
=
0
;
do
{
// Keep track of tree iterator
workloadBuffer
[
idxCell
].
iterator
=
octreeIterator
;
// Count the nb of M2L for this cell
workloadBuffer
[
idxCell
].
amountOfWork
=
tree
->
getInteractionNeighbors
(
neighbors
,
octreeIterator
.
getCurrentGlobalCoordinate
(),
idxLevel
,
1
);
// Keep the total amount of work
totalWork
+=
workloadBuffer
[
idxCell
].
amountOfWork
;
++
idxCell
;
}
while
(
octreeIterator
.
moveRight
());
// Now split between thread
generateIntervalFromWorkload
(
&
workloadM2L
[
idxLevel
],
totalWork
,
workloadBuffer
.
get
(),
idxCell
);
}
}
{
// Do it for the M2M L2L
/// FLOG(FLog::Controller << "[Balance] M2M L2L:\n");
workloadM2M
.
resize
(
OctreeHeight
);
workloadL2L
.
resize
(
OctreeHeight
);
typename
OctreeClass
::
Iterator
avoidGotoLeftIterator
(
tree
);
avoidGotoLeftIterator
.
gotoBottomLeft
();
avoidGotoLeftIterator
.
moveUp
();
for
(
int
idxLevel
=
OctreeHeight
-
2
;
idxLevel
>=
2
;
--
idxLevel
){
FLOG
(
FLog
::
Controller
<<
"[Balance]
\t
level "
<<
idxLevel
<<
":
\n
"
);
typename
OctreeClass
::
Iterator
octreeIterator
(
avoidGotoLeftIterator
);
avoidGotoLeftIterator
.
moveUp
();
FSize
idxCell
=
0
;
FSize
totalWork
=
0
;
do
{
// Keep track of tree iterator
workloadBuffer
[
idxCell
].
iterator
=
octreeIterator
;
// Count the nb of children of the current cell
workloadBuffer
[
idxCell
].
amountOfWork
=
0
;
CellClass
**
child
=
octreeIterator
.
getCurrentChild
();
for
(
int
idxChild
=
0
;
idxChild
<
8
;
++
idxChild
){
if
(
child
[
idxChild
])
workloadBuffer
[
idxCell
].
amountOfWork
+=
1
;
}
// Keep the total amount of work
totalWork
+=
workloadBuffer
[
idxCell
].
amountOfWork
;
++
idxCell
;
}
while
(
octreeIterator
.
moveRight
());
// Now split between thread
generateIntervalFromWorkload
(
&
workloadM2M
[
idxLevel
],
totalWork
,
workloadBuffer
.
get
(),
idxCell
);
generateIntervalFromWorkload
(
&
workloadL2L
[
idxLevel
],
totalWork
,
workloadBuffer
.
get
(),
idxCell
);
}
}
{
// Prepare the P2P
const
int
LeafIndex
=
OctreeHeight
-
1
;
leafsDataArray
.
reset
(
new
LeafData
[
leafsNumber
]);
// We need the offset for each color
int
startPosAtShape
[
SizeShape
]
=
{
0
};
for
(
int
idxShape
=
1
;
idxShape
<
SizeShape
;
++
idxShape
){
startPosAtShape
[
idxShape
]
=
startPosAtShape
[
idxShape
-
1
]
+
shapeLeaves
[
idxShape
-
1
];
}
// Prepare each color
typename
OctreeClass
::
Iterator
octreeIterator
(
tree
);
octreeIterator
.
gotoBottomLeft
();
FSize
workPerShape
[
SizeShape
]
=
{
0
};
// for each leafs
for
(
int
idxLeaf
=
0
;
idxLeaf
<
leafsNumber
;
++
idxLeaf
){
const
FTreeCoordinate
&
coord
=
octreeIterator
.
getCurrentGlobalCoordinate
();
const
int
shapePosition
=
(
coord
.
getX
()
%
3
)
*
9
+
(
coord
.
getY
()
%
3
)
*
3
+
(
coord
.
getZ
()
%
3
);
const
int
positionToWork
=
startPosAtShape
[
shapePosition
]
++
;
leafsDataArray
[
positionToWork
].
index
=
octreeIterator
.
getCurrentGlobalIndex
();
leafsDataArray
[
positionToWork
].
coord
=
coord
;
leafsDataArray
[
positionToWork
].
targets
=
octreeIterator
.
getCurrentListTargets
();
leafsDataArray
[
positionToWork
].
sources
=
octreeIterator
.
getCurrentListSrc
();
// For now the cost is simply based on the number of particles
const
FSize
nbPartInLeaf
=
octreeIterator
.
getCurrentListTargets
()
->
getNbParticles
();
workloadBuffer
[
positionToWork
].
amountOfWork
=
nbPartInLeaf
*
nbPartInLeaf
;
ContainerClass
*
neighbors
[
27
];
tree
->
getLeafsNeighbors
(
neighbors
,
octreeIterator
.
getCurrentGlobalCoordinate
(),
LeafIndex
);
for
(
int
idxNeigh
=
0
;
idxNeigh
<
27
;
++
idxNeigh
){
if
(
neighbors
[
idxNeigh
]){
workloadBuffer
[
positionToWork
].
amountOfWork
+=
nbPartInLeaf
*
neighbors
[
idxNeigh
]
->
getNbParticles
();
}
}
workPerShape
[
shapePosition
]
+=
workloadBuffer
[
positionToWork
].
amountOfWork
;
octreeIterator
.
moveRight
();
}
workloadP2P
.
resize
(
SizeShape
);
int
offsetShape
=
0
;
for
(
int
idxShape
=
0
;
idxShape
<
SizeShape
;
++
idxShape
){
std
::
vector
<
std
::
pair
<
int
,
int
>>*
intervals
=
&
workloadP2P
[
idxShape
];
const
int
nbElements
=
shapeLeaves
[
idxShape
];
const
FSize
totalWork
=
workPerShape
[
idxShape
];
// Now split between thread
(
*
intervals
).
resize
(
MaxThreads
);
// Ideally each thread will have this
const
FSize
idealWork
=
(
totalWork
/
MaxThreads
);
// Assign default value for first thread
int
idxThread
=
0
;
(
*
intervals
)[
idxThread
].
first
=
offsetShape
;
FSize
assignWork
=
workloadBuffer
[
0
].
amountOfWork
;
for
(
int
idxElement
=
1
+
offsetShape
;
idxElement
<
nbElements
+
offsetShape
;
++
idxElement
){
if
(
FMath
::
Abs
((
idxThread
+
1
)
*
idealWork
-
assignWork
)
<
FMath
::
Abs
((
idxThread
+
1
)
*
idealWork
-
assignWork
-
workloadBuffer
[
idxElement
].
amountOfWork
)
&&
idxThread
!=
MaxThreads
-
1
){
(
*
intervals
)[
idxThread
].
second
=
idxElement
;
/// FLOG(FLog::Controller << "[Balance] Shape " << idxShape << " Thread " << idxThread << " goes from "
/// << (*intervals)[idxThread].first << " to " << (*intervals)[idxThread].second << "\n");
idxThread
+=
1
;
(
*
intervals
)[
idxThread
].
first
=
idxElement
;
}
assignWork
+=
workloadBuffer
[
idxElement
].
amountOfWork
;
}
(
*
intervals
)[
idxThread
].
second
=
nbElements
+
offsetShape
;
/// FLOG(FLog::Controller << "[Balance] Shape " << idxShape << " Thread " << idxThread << " goes from "
/// << (*intervals)[idxThread].first << " to " << (*intervals)[idxThread].second << "\n");
offsetShape
+=
nbElements
;
}
}
}
    /////////////////////////////////////////////////////////////////////////////
    // P2M
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the P2M kernel: each thread processes its precomputed interval of leaves. */
    void bottomPass(){
        FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        #pragma omp parallel
        {
            // Each thread reads only its own precomputed interval (workloadP2M[thread])
            KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
            const int nbCellsToCompute = workloadP2M[omp_get_thread_num()].nbElements;
            typename OctreeClass::Iterator octreeIterator(workloadP2M[omp_get_thread_num()].iterator);

            for(int idxLeafs = 0 ; idxLeafs < nbCellsToCompute ; ++idxLeafs){
                // We need the current cell that represent the leaf
                // and the list of particles
                myThreadkernels->P2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentListSrc());
                octreeIterator.moveRight();
            }
        }

        FLOG( computationCounter.tac() );
        FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
    }
    /////////////////////////////////////////////////////////////////////////////
    // Upward
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the M2M kernel, level by level from just above the leaves up to the
     *  upper working level; within a level each thread processes its interval. */
    void upwardPass(){
        FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        // for each levels
        for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel){
            FLOG(FTic counterTimeLevel);
            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                // Per-thread interval precomputed for this level
                KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadM2M[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator(workloadM2M[idxLevel][omp_get_thread_num()].iterator);

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
                    // We need the current cell and the child
                    // child is an array (of 8 child) that may be null
                    myThreadkernels->M2M( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                    octreeIterator.moveRight();
                }
            }

            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }

        FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
    }
    /////////////////////////////////////////////////////////////////////////////
    // Transfer
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the M2L kernel, level by level; within a level each thread
     *  processes its precomputed interval of cells. */
    void transferPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        // for each levels
        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel){
            // Use the user-provided separation criteria only at the lowest working level
            const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeperationCriteria);
            FLOG(FTic counterTimeLevel);
            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadM2L[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator(workloadM2L[idxLevel][omp_get_thread_num()].iterator);
                const CellClass* neighbors[343];

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
                    const int counter = tree->getInteractionNeighbors(neighbors, octreeIterator.getCurrentGlobalCoordinate(), idxLevel, separationCriteria);
                    // Skip cells without any interaction neighbor
                    if(counter) myThreadkernels->M2L( octreeIterator.getCurrentCell() , neighbors, counter, idxLevel);
                    octreeIterator.moveRight();
                }

                // Notify the kernel that the level is complete (each thread notifies its own copy)
                myThreadkernels->finishedLevelM2L(idxLevel);
            }
            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
    }
    /////////////////////////////////////////////////////////////////////////////
    // Downward
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the L2L kernel, level by level down to (excluding) the leaf level;
     *  within a level each thread processes its precomputed interval. */
    void downardPass(){
        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);

        const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1;
        // for each levels excepted leaf level
        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < heightMinusOne ; ++idxLevel){
            FLOG(FTic counterTimeLevel);
            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
                const int nbCellsToCompute = workloadL2L[idxLevel][omp_get_thread_num()].nbElements;
                typename OctreeClass::Iterator octreeIterator(workloadL2L[idxLevel][omp_get_thread_num()].iterator);

                for(int idxCell = 0 ; idxCell < nbCellsToCompute ; ++idxCell){
                    // Propagate to the children (array of 8, entries may be null)
                    myThreadkernels->L2L( octreeIterator.getCurrentCell() , octreeIterator.getCurrentChild(), idxLevel);
                    octreeIterator.moveRight();
                }
            }
            FLOG(computationCounter.tac());
            FLOG( FLog::Controller << "\t\t>> Level " << idxLevel << " = "  << counterTimeLevel.tacAndElapsed() << "s\n" );
        }

        FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
    }
    /////////////////////////////////////////////////////////////////////////////
    // Direct
    /////////////////////////////////////////////////////////////////////////////

    /** Runs the L2P kernel: each thread processes its precomputed interval of leaves. */
    void L2P(){
        #pragma omp parallel
        {
            KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
            const int nbCellsToCompute = workloadL2P[omp_get_thread_num()].nbElements;
            typename OctreeClass::Iterator octreeIterator(workloadL2P[omp_get_thread_num()].iterator);

            for(int idxLeafs = 0 ; idxLeafs < nbCellsToCompute ; ++idxLeafs){
                // We need the current cell that represent the leaf
                // and the list of particles
                myThreadkernels->L2P( octreeIterator.getCurrentCell() , octreeIterator.getCurrentListTargets());
                octreeIterator.moveRight();
            }
        }
    }
    /** Runs the P2P kernel.
     *
     * Leaves are processed shape (color) by shape: within one shape each
     * thread works on its own [first, second) interval of leafsDataArray
     * (precomputed in buildThreadIntervals), which is why the P2P is thread
     * safe without locking.
     * NOTE(review): the previous comment documented p2pEnabled/l2pEnabled
     * parameters that this method does not have; L2P/P2P selection is done by
     * executeCore.
     */
    void directPass(){
        FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); );
        FLOG(FTic counterTime);
        FLOG(FTic computationCounter);
        FLOG(FTic computationCounterP2P);

        const int LeafIndex = OctreeHeight - 1;

        #pragma omp parallel
        {
            // Timing is sampled on thread 0 only
            FLOG(if(!omp_get_thread_num()) computationCounter.tic());
            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
            // There is a maximum of 26 neighbors
            ContainerClass* neighbors[27];

            for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                // This thread's interval of leaves for the current shape
                const std::pair<int,int> interval = workloadP2P[idxShape][omp_get_thread_num()];
                for(int idxLeafs = interval.first ; idxLeafs < interval.second ; ++idxLeafs){
                    LeafData& currentIter = leafsDataArray[idxLeafs];
                    // need the current particles and neighbors particles
                    FLOG(if(!omp_get_thread_num()) computationCounterP2P.tic());
                    const int counter = tree->getLeafsNeighbors(neighbors, currentIter.coord, LeafIndex);
                    myThreadkernels.P2P(currentIter.coord, currentIter.targets,
                                        currentIter.sources, neighbors, counter);
                    FLOG(if(!omp_get_thread_num()) computationCounterP2P.tac());
                }
            }
        }

        FLOG(computationCounter.tac());
        FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = "  << counterTime.tacAndElapsed() << "s)\n" );
        FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.cumulated() << " s\n" );
        FLOG( FLog::Controller << "\t\t Computation P2P : " << computationCounterP2P.cumulated() << " s\n" );
    }
};
#endif
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment