From 64603413d5cd8bd6e52f43fe01dafecdd827e257 Mon Sep 17 00:00:00 2001 From: Berenger Bramas <bbramas@mpcdf.mpg.de> Date: Thu, 19 Jan 2017 16:48:54 +0100 Subject: [PATCH] Make the MPI tsm working and add a test --- Src/Core/FFmmAlgorithmThreadProcTsm.hpp | 2966 ++++++++++++----------- Src/Core/FFmmAlgorithmThreadTsm.hpp | 8 +- Tests/Utils/testFmmAlgorithmProcTsm.cpp | 505 ++++ 3 files changed, 2034 insertions(+), 1445 deletions(-) create mode 100644 Tests/Utils/testFmmAlgorithmProcTsm.cpp diff --git a/Src/Core/FFmmAlgorithmThreadProcTsm.hpp b/Src/Core/FFmmAlgorithmThreadProcTsm.hpp index 517430893..55413421e 100644 --- a/Src/Core/FFmmAlgorithmThreadProcTsm.hpp +++ b/Src/Core/FFmmAlgorithmThreadProcTsm.hpp @@ -29,7 +29,6 @@ // ==== CMAKE ===== // @FUSE_MPI // ================ -// Keep in private GIT // // #ifndef FFMMALGORITHMTHREADPROC_HPP @@ -66,7 +65,7 @@ #include <vector> /** - * @author + * @author * * Please read the license * @@ -88,1219 +87,1294 @@ template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass, class P2PExclusionClass = FP2PMiddleExclusion> class FFmmAlgorithmThreadProcTsm : public FAbstractAlgorithm, public FAlgorithmTimers { private: - OctreeClass* const tree; ///< The octree to work on - KernelClass** kernels; ///< The kernels - - const FMpi::FComm comm; ///< MPI comm - FMpi::FComm fcomCompute; - - /// Used to store pointers to cells/leafs to work with - typename OctreeClass::Iterator* iterArray; - /// Used to store pointers to cells/leafs to send/rcv - typename OctreeClass::Iterator* iterArrayComm; - - int numberOfLeafs; ///< To store the size at the previous level - const int MaxThreads; ///< Max number of thread allowed by openmp - const int nbProcessOrig; ///< Process count - const int idProcessOrig; ///< Current process id - int nbProcess; ///< Process count - int idProcess; ///< Current process id - const int OctreeHeight; ///< Tree height - - const int userChunkSize; - const int leafLevelSeparationCriteria; - - /** An interval is the morton index interval - * that a proc uses (i.e. it holds data in this interval) */ - struct Interval{ - MortonIndex leftIndex; - MortonIndex rightIndex; - }; - - /// Current process interval - Interval*const intervals; - /// All processes intervals - Interval*const workingIntervalsPerLevel; - - /// Get an interval from a process id and tree level - Interval& getWorkingInterval( int level, int proc){ - return workingIntervalsPerLevel[OctreeHeight * proc + level]; - } - - /// Get an interval from a process id and tree level - const Interval& getWorkingInterval( int level, int proc) const { - return workingIntervalsPerLevel[OctreeHeight * proc + level]; - } - - /// Does \a procIdx have work at given \a idxLevel - /** i.e. does it hold cells and is responsible of them ? 
*/ - bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const { - return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex; - } - - /** True if the \a idxProc left cell at \a idxLevel+1 has the same parent as us for our right cell */ - bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const { - return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3); - } - - /** True if the idxProc right cell at idxLevel+1 has the same parent as us for our left cell */ - bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const { - return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3); - } + OctreeClass* const tree; ///< The octree to work on + KernelClass** kernels; ///< The kernels + + const FMpi::FComm comm; ///< MPI comm + FMpi::FComm fcomCompute; + + /// Used to store pointers to cells/leafs to work with + typename OctreeClass::Iterator* iterArray; + /// Used to store pointers to cells/leafs to send/rcv + typename OctreeClass::Iterator* iterArrayComm; + + int numberOfLeafs; ///< To store the size at the previous level + const int MaxThreads; ///< Max number of thread allowed by openmp + const int nbProcessOrig; ///< Process count + const int idProcessOrig; ///< Current process id + int nbProcess; ///< Process count + int idProcess; ///< Current process id + const int OctreeHeight; ///< Tree height + + const int userChunkSize; + const int leafLevelSeparationCriteria; + + /** An interval is the morton index interval + * that a proc uses (i.e. it holds data in this interval) */ + struct Interval{ + MortonIndex leftIndex; + MortonIndex rightIndex; + }; + + /// Current process interval + Interval*const intervals; + /// All processes intervals + Interval*const workingIntervalsPerLevel; + + /// Get an interval from a process id and tree level + Interval& getWorkingInterval( int level, int proc){ + return workingIntervalsPerLevel[OctreeHeight * proc + level]; + } + + /// Get an interval from a process id and tree level + const Interval& getWorkingInterval( int level, int proc) const { + return workingIntervalsPerLevel[OctreeHeight * proc + level]; + } + + /// Does \a procIdx have work at given \a idxLevel + /** i.e. does it hold cells and is responsible of them ? */ + bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const { + return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex; + } + + /** True if the \a idxProc left cell at \a idxLevel+1 has the same parent as us for our right cell */ + bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const { + return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3); + } + + /** True if the idxProc right cell at idxLevel+1 has the same parent as us for our left cell */ + bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const { + return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3); + } public: - /// Get current process interval at given \a level - Interval& getWorkingInterval( int level){ - return getWorkingInterval(level, idProcess); - } - - /// Does the current process has some work at this level ? 
- bool hasWorkAtLevel( int level){ - return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).rightIndex) < (getWorkingInterval(level, idProcess).rightIndex); - } - - /**@brief Constructor - * @param inTree the octree to work on - * @param inKernels the kernels to call - * - * An assert is launched if one of the arguments is null - */ - FFmmAlgorithmThreadProcTsm(const FMpi::FComm& inComm, OctreeClass* const inTree, - KernelClass* const inKernels, - const int inUserChunkSize = 10, const int inLeafLevelSeperationCriteria = 1) : - tree(inTree), - kernels(nullptr), - comm(inComm), - fcomCompute(inComm), - iterArray(nullptr), - iterArrayComm(nullptr), - numberOfLeafs(0), - MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())), - nbProcessOrig(inComm.processCount()), - idProcessOrig(inComm.processId()), - nbProcess(0), - idProcess(0), - OctreeHeight(tree->getHeight()), - userChunkSize(inUserChunkSize), - leafLevelSeparationCriteria(inLeafLevelSeperationCriteria), - intervals(new Interval[inComm.processCount()]), - workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) { - FAssertLF(tree, "tree cannot be null"); - FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3"); - - this->kernels = new KernelClass*[MaxThreads]; -#pragma omp parallel num_threads(MaxThreads) - { -#pragma omp critical (InitFFmmAlgorithmThreadProcTsm) - { - this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels); - } + /// Get current process interval at given \a level + Interval& getWorkingInterval( int level){ + return getWorkingInterval(level, idProcess); } - FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight()); + /// Does the current process has some work at this level ? + bool hasWorkAtLevel( int level){ + return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).rightIndex) < (getWorkingInterval(level, idProcess).rightIndex); + } + + /**@brief Constructor + * @param inTree the octree to work on + * @param inKernels the kernels to call + * + * An assert is launched if one of the arguments is null + */ + FFmmAlgorithmThreadProcTsm(const FMpi::FComm& inComm, OctreeClass* const inTree, + KernelClass* const inKernels, + const int inUserChunkSize = 10, const int inLeafLevelSeperationCriteria = 1) : + tree(inTree), + kernels(nullptr), + comm(inComm), + fcomCompute(inComm), + iterArray(nullptr), + iterArrayComm(nullptr), + numberOfLeafs(0), + MaxThreads(FEnv::GetValue("SCALFMM_ALGO_NUM_THREADS",omp_get_max_threads())), + nbProcessOrig(inComm.processCount()), + idProcessOrig(inComm.processId()), + nbProcess(0), + idProcess(0), + OctreeHeight(tree->getHeight()), + userChunkSize(inUserChunkSize), + leafLevelSeparationCriteria(inLeafLevelSeperationCriteria), + intervals(new Interval[inComm.processCount()]), + workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()]) { + FAssertLF(tree, "tree cannot be null"); + FAssertLF(leafLevelSeparationCriteria < 3, "Separation criteria should be < 3"); + + this->kernels = new KernelClass*[MaxThreads]; +#pragma omp parallel num_threads(MaxThreads) + { +#pragma omp critical (InitFFmmAlgorithmThreadProcTsm) + { + this->kernels[omp_get_thread_num()] = new KernelClass(*inKernels); + } + } - FLOG(FLog::Controller << "FFmmAlgorithmThreadProcTsm\n"); - FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcessOrig << ", I am " << idProcessOrig << ".\n"); - FLOG(FLog::Controller << "Chunck Size = " << userChunkSize << "\n"); - } + 
FAbstractAlgorithm::setNbLevelsInTree(tree->getHeight()); - /// Default destructor - virtual ~FFmmAlgorithmThreadProcTsm(){ - for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){ - delete this->kernels[idxThread]; + FLOG(FLog::Controller << "FFmmAlgorithmThreadProcTsm\n"); + FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcessOrig << ", I am " << idProcessOrig << ".\n"); + FLOG(FLog::Controller << "Chunck Size = " << userChunkSize << "\n"); } - delete [] this->kernels; - delete [] intervals; - delete [] workingIntervalsPerLevel; - } + /// Default destructor + virtual ~FFmmAlgorithmThreadProcTsm(){ + for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){ + delete this->kernels[idxThread]; + } + delete [] this->kernels; + + delete [] intervals; + delete [] workingIntervalsPerLevel; + } protected: - /** - * To execute the fmm algorithm - * Call this function to run the complete algorithm - */ - void executeCore(const unsigned operationsToProceed) override { - // We are not involve if the tree is empty - const int iHaveParticles = (!tree->isEmpty()); + /** + * To execute the fmm algorithm + * Call this function to run the complete algorithm + */ + void executeCore(const unsigned operationsToProceed) override { + // We are not involve if the tree is empty + const int iHaveParticles = (!tree->isEmpty()); - std::unique_ptr<int[]> hasParticles(new int[comm.processCount()]); - FMpi::Assert( MPI_Allgather(const_cast<int*>(&iHaveParticles), 1,MPI_INT, - hasParticles.get(), 1, MPI_INT, - comm.getComm()), __LINE__); + std::unique_ptr<int[]> hasParticles(new int[comm.processCount()]); + FMpi::Assert( MPI_Allgather(const_cast<int*>(&iHaveParticles), 1,MPI_INT, + hasParticles.get(), 1, MPI_INT, + comm.getComm()), __LINE__); - fcomCompute = FMpi::FComm(comm); - fcomCompute.groupReduce(hasParticles.get()); + fcomCompute = FMpi::FComm(comm); + fcomCompute.groupReduce(hasParticles.get()); - if(iHaveParticles){ + if(iHaveParticles){ - nbProcess = fcomCompute.processCount(); - idProcess = fcomCompute.processId(); + nbProcess = fcomCompute.processCount(); + idProcess = fcomCompute.processId(); - FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n"); + FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n"); - // Count leaf + // Count leaf #ifdef SCALFMM_TRACE_ALGO - eztrace_resume(); + eztrace_resume(); #endif - this->numberOfLeafs = 0; - { - Interval myFullInterval; - {//Building the interval with the first and last leaves (and count the number of leaves) - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex(); - do{ - ++this->numberOfLeafs; - } while(octreeIterator.moveRight()); - myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex(); - } - // Allocate a number to store the pointer of the cells at a level - iterArray = new typename OctreeClass::Iterator[numberOfLeafs]; - iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs]; - FAssertLF(iterArray, "iterArray bad alloc"); - FAssertLF(iterArrayComm, "iterArrayComm bad alloc"); - - // We get the leftIndex/rightIndex indexes from each procs - FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, fcomCompute.getComm()), __LINE__ ); - - // Build my intervals for all levels - std::unique_ptr<Interval[]> 
myIntervals(new Interval[OctreeHeight]); - // At leaf level we know it is the full interval - myIntervals[OctreeHeight - 1] = myFullInterval; - - // We can estimate the interval for each level by using the parent/child relation - for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){ - myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex >> 3; - myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3; - } - - // Process 0 uses the estimates as real intervals, but other processes - // should remove cells that belong to others - if(idProcess != 0){ - //We test for each level if process on left (idProcess-1) own cell I thought I owned - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - octreeIterator.moveUp(); - - // At h-1 the working limit is the parent of the right cell of the proc on the left - MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3; - - // We check if we have no more work to do - int nullIntervalFromLevel = 0; - - for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){ - while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){ - if( !octreeIterator.moveRight() ){ - // We cannot move right we are not owner of any more cell - nullIntervalFromLevel = idxLevel; - break; - } - } - // If we are responsible for some cells at this level keep the first index - if(nullIntervalFromLevel == 0){ - myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex(); - octreeIterator.moveUp(); - workingLimitAtLevel >>= 3; - } - } - // In case we are not responsible for any cells we put the leftIndex = rightIndex+1 - for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){ - myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1; - } - } - - // We get the leftIndex/rightIndex indexes from each procs - FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, - workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, fcomCompute.getComm()), __LINE__ ); - } + this->numberOfLeafs = 0; + { + Interval myFullInterval; + {//Building the interval with the first and last leaves (and count the number of leaves) + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); + myFullInterval.leftIndex = octreeIterator.getCurrentGlobalIndex(); + do{ + ++this->numberOfLeafs; + } while(octreeIterator.moveRight()); + myFullInterval.rightIndex = octreeIterator.getCurrentGlobalIndex(); + } + // Allocate a number to store the pointer of the cells at a level + iterArray = new typename OctreeClass::Iterator[numberOfLeafs]; + iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs]; + FAssertLF(iterArray, "iterArray bad alloc"); + FAssertLF(iterArrayComm, "iterArrayComm bad alloc"); + + // We get the leftIndex/rightIndex indexes from each procs + FMpi::MpiAssert( MPI_Allgather( &myFullInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, fcomCompute.getComm()), __LINE__ ); + + // Build my intervals for all levels + std::unique_ptr<Interval[]> myIntervals(new Interval[OctreeHeight]); + // At leaf level we know it is the full interval + myIntervals[OctreeHeight - 1] = myFullInterval; + + // We can estimate the interval for each level by using the parent/child relation + for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){ + myIntervals[idxLevel].leftIndex = myIntervals[idxLevel+1].leftIndex 
>> 3; + myIntervals[idxLevel].rightIndex = myIntervals[idxLevel+1].rightIndex >> 3; + } + + // Process 0 uses the estimates as real intervals, but other processes + // should remove cells that belong to others + if(idProcess != 0){ + //We test for each level if process on left (idProcess-1) own cell I thought I owned + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); + octreeIterator.moveUp(); + + // At h-1 the working limit is the parent of the right cell of the proc on the left + MortonIndex workingLimitAtLevel = intervals[idProcess-1].rightIndex >> 3; + + // We check if we have no more work to do + int nullIntervalFromLevel = 0; + + for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 && nullIntervalFromLevel == 0 ; --idxLevel){ + while(octreeIterator.getCurrentGlobalIndex() <= workingLimitAtLevel){ + if( !octreeIterator.moveRight() ){ + // We cannot move right we are not owner of any more cell + nullIntervalFromLevel = idxLevel; + break; + } + } + // If we are responsible for some cells at this level keep the first index + if(nullIntervalFromLevel == 0){ + myIntervals[idxLevel].leftIndex = octreeIterator.getCurrentGlobalIndex(); + octreeIterator.moveUp(); + workingLimitAtLevel >>= 3; + } + } + // In case we are not responsible for any cells we put the leftIndex = rightIndex+1 + for(int idxLevel = nullIntervalFromLevel ; idxLevel >= 1 ; --idxLevel){ + myIntervals[idxLevel].leftIndex = myIntervals[idxLevel].rightIndex + 1; + } + } + + // We get the leftIndex/rightIndex indexes from each procs + FMpi::MpiAssert( MPI_Allgather( myIntervals.get(), int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, + workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, fcomCompute.getComm()), __LINE__ ); + } #ifdef SCALFMM_TRACE_ALGO - eztrace_enter_event("P2M", EZTRACE_YELLOW); + eztrace_enter_event("P2M", EZTRACE_YELLOW); #endif - Timers[P2MTimer].tic(); - if(operationsToProceed & FFmmP2M) bottomPass(); - Timers[P2MTimer].tac(); + Timers[P2MTimer].tic(); + if(operationsToProceed & FFmmP2M) bottomPass(); + Timers[P2MTimer].tac(); #ifdef SSCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("M2M", EZTRACE_PINK); + eztrace_leave_event(); + eztrace_enter_event("M2M", EZTRACE_PINK); #endif - Timers[M2MTimer].tic(); - if(operationsToProceed & FFmmM2M) upwardPass(); - Timers[M2MTimer].tac(); + Timers[M2MTimer].tic(); + if(operationsToProceed & FFmmM2M) upwardPass(); + Timers[M2MTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("M2L", EZTRACE_GREEN); + eztrace_leave_event(); + eztrace_enter_event("M2L", EZTRACE_GREEN); #endif - Timers[M2LTimer].tic(); - if(operationsToProceed & FFmmM2L) transferPass(); - Timers[M2LTimer].tac(); + Timers[M2LTimer].tic(); + if(operationsToProceed & FFmmM2L) transferPass(); + Timers[M2LTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("L2L", EZTRACE_PINK); + eztrace_leave_event(); + eztrace_enter_event("L2L", EZTRACE_PINK); #endif - Timers[L2LTimer].tic(); - if(operationsToProceed & FFmmL2L) downardPass(); - Timers[L2LTimer].tac(); + Timers[L2LTimer].tic(); + if(operationsToProceed & FFmmL2L) downardPass(); + Timers[L2LTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_enter_event("L2P+P2P", EZTRACE_BLUE); + eztrace_leave_event(); + eztrace_enter_event("L2P+P2P", EZTRACE_BLUE); #endif - Timers[NearTimer].tic(); - if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & 
FFmmP2P),(operationsToProceed & FFmmL2P)); - Timers[NearTimer].tac(); + Timers[NearTimer].tic(); + if( (operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P) ) directPass((operationsToProceed & FFmmP2P),(operationsToProceed & FFmmL2P)); + Timers[NearTimer].tac(); #ifdef SCALFMM_TRACE_ALGO - eztrace_leave_event(); - eztrace_stop(); + eztrace_leave_event(); + eztrace_stop(); #endif - // delete array - delete [] iterArray; - delete [] iterArrayComm; - iterArray = nullptr; - iterArrayComm = nullptr; + // delete array + delete [] iterArray; + delete [] iterArrayComm; + iterArray = nullptr; + iterArrayComm = nullptr; #ifdef SCALFMM_TRACE_ALGO - eztrace_stop(); + eztrace_stop(); #endif + } + else{ + FLOG( FLog::Controller << "\tProcess = " << comm.processId() << " has zero particles.\n" ); + } } - else{ - FLOG( FLog::Controller << "\tProcess = " << comm.processId() << " has zero particles.\n" ); - } - } - - ///////////////////////////////////////////////////////////////////////////// - // P2M - ///////////////////////////////////////////////////////////////////////////// - - /** - * P2M Bottom Pass - * No communication are involved in the P2M. - * It is similar to multi threaded version. - * We traverse alle the octree to set if cells are sources or targets - */ - void bottomPass(){ - FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) ); - FLOG(FTic counterTime); - FLOG(FTic computationCounter); - typename OctreeClass::Iterator octreeIterator(tree); - - // Copy the ptr to leaves in array - octreeIterator.gotoBottomLeft(); - int leafs = 0; - do{ - iterArray[leafs++] = octreeIterator; - } while(octreeIterator.moveRight()); - - const int chunkSize = FMath::Max(1 , numberOfLeafs/(omp_get_max_threads()*omp_get_max_threads())); - - FLOG(computationCounter.tic()); + + ///////////////////////////////////////////////////////////////////////////// + // P2M + ///////////////////////////////////////////////////////////////////////////// + + /** + * P2M Bottom Pass + * No communication is involved in the P2M. + * It is similar to the multi-threaded version.
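+ * The TSM specificity is that a leaf cell may hold sources, targets, or both:
+ * the flags set during this pass tell the later passes which cells they can skip.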
+ * We traverse alle the octree to set if cells are sources or targets + */ + void bottomPass(){ + FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) ); + FLOG(FTic counterTime); + FLOG(FTic computationCounter); + typename OctreeClass::Iterator octreeIterator(tree); + + // Copy the ptr to leaves in array + octreeIterator.gotoBottomLeft(); + int leafs = 0; + do{ + iterArray[leafs++] = octreeIterator; + } while(octreeIterator.moveRight()); + + const int chunkSize = FMath::Max(1 , numberOfLeafs/(omp_get_max_threads()*omp_get_max_threads())); + + FLOG(computationCounter.tic()); #pragma omp parallel num_threads(MaxThreads) - { - // Each thread get its own kernel - KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; - // Parallel iteration on the leaves + { + // Each thread get its own kernel + KernelClass * const myThreadkernels = kernels[omp_get_thread_num()]; + // Parallel iteration on the leaves #pragma omp for nowait schedule(dynamic, userChunkSize) - for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){ - // We need the current cell that represent the leaf - // and the list of particles - ContainerClass* const sources = iterArray[idxLeafs].getCurrentListSrc(); - if(sources->getNbParticles()){ - iterArray[idxLeafs].getCurrentCell()->setSrcChildTrue(); - myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , sources); - } - if(iterArray[idxLeafs].getCurrentListTargets()->getNbParticles()){ - iterArray[idxLeafs].getCurrentCell()->setTargetsChildTrue(); - } - } - } - FLOG(computationCounter.tac()); - FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" ); - FLOG( FLog::Controller << "\t\t Computation: " << computationCounter.elapsed() << " s\n" ); - FLOG( FLog::Controller.flush()); - } - - ///////////////////////////////////////////////////////////////////////////// - // Upward - ///////////////////////////////////////////////////////////////////////////// - - /** M2M */ - void upwardPass(){ - FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); ); - FLOG(FTic counterTime); - FLOG(FTic computationCounter); - FLOG(FTic singleCounter); - FLOG(FTic parallelCounter); - - // Start from leal level (height-1) - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - octreeIterator.moveUp(); - - for(int idxLevel = OctreeHeight - 2 ; idxLevel > FAbstractAlgorithm::lowerWorkingLevel-1 ; --idxLevel){ - octreeIterator.moveUp(); + for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){ + // We need the current cell that represent the leaf + // and the list of particles + ContainerClass* const sources = iterArray[idxLeafs].getCurrentListSrc(); + if(sources->getNbParticles()){ + iterArray[idxLeafs].getCurrentCell()->setSrcChildTrue(); + myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , sources); + } + if(iterArray[idxLeafs].getCurrentListTargets()->getNbParticles()){ + iterArray[idxLeafs].getCurrentCell()->setTargetsChildTrue(); + } + } + } + FLOG(computationCounter.tac()); + FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" ); + FLOG( FLog::Controller << "\t\t Computation: " << computationCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller.flush()); } - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - - // The proc to send the shared cells to - // Starting to the proc on the left this variable will go to 0 - int currentProcIdToSendTo = (idProcess - 1); - - // There are 
a maximum of 1 sends and 8-1 receptions - MPI_Request requests[8]; - MPI_Status status[8]; - - MPI_Request requestsSize[8]; - MPI_Status statusSize[8]; - - FSize bufferSize; - FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7 - std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]); - FSize recvBufferSize[7]; - CellClass recvBufferCells[7]; - - // The first proc that send to me a cell - // This variable will go to nbProcess - int firstProcThatSend = idProcess + 1; - FLOG(computationCounter.tic()); - - // for each levels - for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel ){ - // Does my cells are covered by my neighbors working interval and so I have no more work? - const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess)); - if(noMoreWorkForMe){ - FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false); - break; - } - - // Copy and count ALL the cells (even the ones outside the working interval) - int totalNbCellsAtLevel = 0; - do{ - iterArray[totalNbCellsAtLevel++] = octreeIterator; - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveUp(); - octreeIterator = avoidGotoLeftIterator; - - int iterMpiRequests = 0; // The iterator for send/recv requests - int iterMpiRequestsSize = 0; // The iterator for send/recv requests - - int nbCellsToSkip = 0; // The number of cells to send - // Skip all the cells that are out of my working interval - while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){ - ++nbCellsToSkip; - } - - // We need to know if we will recv something in order to know if threads skip the last cell - int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1 - bool hasToReceive = false; - if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){ - // Find the first proc that may send to me - while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){ - firstProcThatSend += 1; - } - // Do we have to receive? 
- if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){ - hasToReceive = true; - // Threads do not compute the last cell, we will do it once data are received - nbCellsForThreads -= 1; - } - } - - FLOG(parallelCounter.tic()); + ///////////////////////////////////////////////////////////////////////////// + // Upward + ///////////////////////////////////////////////////////////////////////////// + + /** M2M */ + void upwardPass(){ + FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); ); + FLOG(FTic counterTime); + FLOG(FTic computationCounter); + FLOG(FTic singleCounter); + FLOG(FTic parallelCounter); + + // Start from leal level (height-1) + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); + octreeIterator.moveUp(); + + for(int idxLevel = OctreeHeight - 2 ; idxLevel > FAbstractAlgorithm::lowerWorkingLevel-1 ; --idxLevel){ + octreeIterator.moveUp(); + } + + typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); + + // The proc to send the shared cells to + // Starting to the proc on the left this variable will go to 0 + int currentProcIdToSendTo = (idProcess - 1); + + // There are a maximum of 1 sends and 8-1 receptions + MPI_Request requests[8]; + MPI_Status status[8]; + + MPI_Request requestsSize[8]; + MPI_Status statusSize[8]; + + FSize bufferSize; + FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7 + std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]); + FSize recvBufferSize[7]; + CellClass recvBufferCells[7]; + + // The first proc that send to me a cell + // This variable will go to nbProcess + int firstProcThatSend = idProcess + 1; + FLOG(computationCounter.tic()); + + // for each levels + for(int idxLevel = FMath::Min(OctreeHeight - 2, FAbstractAlgorithm::lowerWorkingLevel - 1) ; idxLevel >= FAbstractAlgorithm::upperWorkingLevel ; --idxLevel ){ + // Does my cells are covered by my neighbors working interval and so I have no more work? + const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess)); + if(noMoreWorkForMe){ + FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false); + break; + } + + // Copy and count ALL the cells (even the ones outside the working interval) + int totalNbCellsAtLevel = 0; + do{ + iterArray[totalNbCellsAtLevel++] = octreeIterator; + } while(octreeIterator.moveRight()); + avoidGotoLeftIterator.moveUp(); + octreeIterator = avoidGotoLeftIterator; + + int iterMpiRequests = 0; // The iterator for send/recv requests + int iterMpiRequestsSize = 0; // The iterator for send/recv requests + + int nbCellsToSkip = 0; // The number of cells to send + // Skip all the cells that are out of my working interval + while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){ + ++nbCellsToSkip; + } + + // We need to know if we will recv something in order to know if threads skip the last cell + int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1 + bool hasToReceive = false; + if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){ + // Find the first proc that may send to me + while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){ + firstProcThatSend += 1; + } + // Do we have to receive? 
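+ // (Only if the first working proc on our right owns, at the level below, a
+ // cell that shares the parent of our rightmost cell; that last parent is
+ // then completed in the single section once the remote children arrive.)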
+ if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){ + hasToReceive = true; + // Threads do not compute the last cell, we will do it once data are received + nbCellsForThreads -= 1; + } + } + + FLOG(parallelCounter.tic()); #pragma omp parallel num_threads(MaxThreads) - { - KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]); - ///////////////////////////////////////////////////////////////////////////////////////// - //This single section post and receive the comms, and then do the M2M associated with it. + { + KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]); + ///////////////////////////////////////////////////////////////////////////////////////// + //This single section post and receive the comms, and then do the M2M associated with it. #pragma omp single nowait - { - FLOG(singleCounter.tic()); - // Master proc never send - if(idProcess != 0){ - // Skip process that have no work at that level - while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) ){ - --currentProcIdToSendTo; - } - // Does the next proc that has work is sharing the parent of my left cell - if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){ - FAssertLF(nbCellsToSkip != 0); - - char packageFlags = 0; - sendBuffer.write(packageFlags); - - // Only the cell the most on the right out of my working interval should be taken in - // consideration (at pos nbCellsToSkip-1) other (x < nbCellsToSkip-1) have already been sent - const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild(); - for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ - // Check if child exists and has sources and it was part of my working interval - if( child[idxChild] && child[idxChild]->hasSrcChild() && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){ - // Add the cell to the buffer - child[idxChild]->serializeUp(sendBuffer); - packageFlags = char(packageFlags | (0x1 << idxChild)); - } - if(realChild[idxChild]->hasTargetsChild()){ - currentCell->setTargetsChildTrue(); - } - } - // Add the flag as first value - sendBuffer.writeAt(0,packageFlags); - // Post the message - bufferSize = sendBuffer.getSize(); - MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo, - FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); - FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); - MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo, - FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); - } - } - - //Post receive, Datas needed in several parts of the section - int nbProcThatSendToMe = 0; - - if(hasToReceive){ - //Test : if the firstProcThatSend father minimal value in interval is lesser than mine - int idProcSource = firstProcThatSend; - // Find the last proc that should send to me - while( idProcSource < nbProcess - && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){ - if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ - MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]), - idProcSource, FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); - 
nbProcThatSendToMe += 1; - FAssertLF(nbProcThatSendToMe <= 7); - } - ++idProcSource; - } - } - - //Wait For the comms, and do the work - // Are we sending or waiting anything? - if(iterMpiRequestsSize){ - FAssertLF(iterMpiRequestsSize <= 8); - MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize); - } - - if(hasToReceive){ - nbProcThatSendToMe = 0; - //Test : if the firstProcThatSend father minimal value in interval is lesser than mine - int idProcSource = firstProcThatSend; - // Find the last proc that should send to me - while( idProcSource < nbProcess - && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){ - if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ - recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]); - FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max()); - MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE, - idProcSource, FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); - nbProcThatSendToMe += 1; - FAssertLF(nbProcThatSendToMe <= 7); - } - ++idProcSource; - } - } - - //Wait For the comms, and do the work - // Are we sending or waiting anything? - if(iterMpiRequests){ - FAssertLF(iterMpiRequests <= 8); - MPI_Waitall( iterMpiRequests, requests, status); - } - - // We had received something so we need to proceed the last M2M - if( hasToReceive ){ - FAssertLF(iterMpiRequests != 0); - CellClass* currentChild[8]; - memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*)); - - // Retreive data and merge my child and the child from others - int positionToInsert = 0; - for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){ - unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>()); - - int position = 0; - while( packageFlags && position < 8){ - while(!(packageFlags & 0x1)){ - packageFlags >>= 1; - ++position; - } - FAssertLF(positionToInsert < 7); - FAssertLF(position < 8); - FAssertLF(!currentChild[position], "Already has a cell here"); - recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]); - currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert]; - - packageFlags >>= 1; - position += 1; - positionToInsert += 1; - } - - recvBuffer[idxProc].seek(0); - } - ////////////////////////////////////////////////////////////////////////////////////////////////////// - // Finally one thread computes with distant cell - - myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel); - firstProcThatSend += nbProcThatSendToMe - 1; - } - // Reset buffer - sendBuffer.reset(); - FLOG(singleCounter.tac()); - } - ///////////////////////////////////////////////////////////////////////////////////////// - // End Of Single section - //////////////////////////////////////////////////////////////////////////////////////// - // - - //////////////////////////////////////////////////////////////////////////////////////// - // All threads proceed the M2M with local cells - // - //////////////////////////////////////////////////////////////////////////////////////// + { + FLOG(singleCounter.tic()); + // Master proc never send + if(idProcess != 0){ + // Skip process that have no work at that level + while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) ){ + --currentProcIdToSendTo; + } 
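+ // The message built below packs, in order: one flag byte (bit i set when
+ // child i of the shared parent is transmitted), then for each flagged child
+ // two ints telling whether it holds sources and whether it holds targets,
+ // then its serialized multipole data when it holds sources. For example,
+ // a flag byte of 0b00000101 announces children 0 and 2.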
+ // Does the next proc that has work is sharing the parent of my left cell + if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){ + FAssertLF(nbCellsToSkip != 0); + + char packageFlags = 0; + sendBuffer.write(packageFlags); + + // Only the cell the most on the right out of my working interval should be taken in + // consideration (at pos nbCellsToSkip-1) other (x < nbCellsToSkip-1) have already been sent + const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild(); + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + // Check if child exists and has sources and it was part of my working interval + if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){ + if( child[idxChild]->hasSrcChild()){ + // Add the cell to the buffer + sendBuffer << int(1); + } + else{ + sendBuffer << int(0); + } + if( child[idxChild]->hasTargetsChild()){ + // Add the cell to the buffer + sendBuffer << int(1); + } + else{ + sendBuffer << int(0); + } + if( child[idxChild]->hasSrcChild() ){ + child[idxChild]->serializeUp(sendBuffer); + } + packageFlags = char(packageFlags | (0x1 << idxChild)); + } + } + // Add the flag as first value + sendBuffer.writeAt(0,packageFlags); + // Post the message + bufferSize = sendBuffer.getSize(); + MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo, + FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); + FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); + MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo, + FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); + } + } + + //Post receive, Datas needed in several parts of the section + int nbProcThatSendToMe = 0; + + if(hasToReceive){ + //Test : if the firstProcThatSend father minimal value in interval is lesser than mine + int idProcSource = firstProcThatSend; + // Find the last proc that should send to me + while( idProcSource < nbProcess + && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){ + if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ + MPI_Irecv(&recvBufferSize[nbProcThatSendToMe], 1, FMpi::GetType(recvBufferSize[nbProcThatSendToMe]), + idProcSource, FMpi::TagFmmM2MSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterMpiRequestsSize++]); + nbProcThatSendToMe += 1; + FAssertLF(nbProcThatSendToMe <= 7); + } + ++idProcSource; + } + } + + //Wait For the comms, and do the work + // Are we sending or waiting anything? 
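+ // (The sizes travel first so that each receive buffer can be resized to the
+ // right capacity before the matching payload request is posted.)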
+ if(iterMpiRequestsSize){ + FAssertLF(iterMpiRequestsSize <= 8); + MPI_Waitall( iterMpiRequestsSize, requestsSize, statusSize); + } + + if(hasToReceive){ + nbProcThatSendToMe = 0; + //Test : if the firstProcThatSend father minimal value in interval is lesser than mine + int idProcSource = firstProcThatSend; + // Find the last proc that should send to me + while( idProcSource < nbProcess + && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){ + if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){ + recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]); + FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max()); + MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE, + idProcSource, FMpi::TagFmmM2M + idxLevel, fcomCompute.getComm(), &requests[iterMpiRequests++]); + nbProcThatSendToMe += 1; + FAssertLF(nbProcThatSendToMe <= 7); + } + ++idProcSource; + } + } + + //Wait For the comms, and do the work + // Are we sending or waiting anything? + if(iterMpiRequests){ + FAssertLF(iterMpiRequests <= 8); + MPI_Waitall( iterMpiRequests, requests, status); + } + + // We had received something so we need to proceed the last M2M + if( hasToReceive ){ + FAssertLF(iterMpiRequests != 0); + CellClass* currentChild[8]; + memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*)); + + // Retreive data and merge my child and the child from others + int positionToInsert = 0; + int nbRemoteWithTgt = 0; + int nbRemoteWithSrc = 0; + for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){ + unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>()); + + int position = 0; + while( packageFlags && position < 8){ + while(!(packageFlags & 0x1)){ + packageFlags >>= 1; + ++position; + } + FAssertLF(positionToInsert < 7); + FAssertLF(position < 8); + FAssertLF(!currentChild[position], "Already has a cell here"); + int hasSrcChild; + recvBuffer[idxProc] >> hasSrcChild; + FAssert(hasSrcChild == 0 || hasSrcChild == 1); + if(hasSrcChild) nbRemoteWithSrc += 1; + int hasTargChild; + recvBuffer[idxProc] >> hasTargChild; + FAssert(hasTargChild == 0 || hasTargChild == 1); + if(hasTargChild) nbRemoteWithTgt += 1; + + if(hasSrcChild){ + recvBufferCells[positionToInsert].deserializeUp(recvBuffer[idxProc]); + currentChild[position] = (CellClass*) &recvBufferCells[positionToInsert]; + position += 1; + positionToInsert += 1; + } + + packageFlags >>= 1; + } + + recvBuffer[idxProc].seek(0); + } + ////////////////////////////////////////////////////////////////////////////////////////////////////// + // Finally one thread computes with distant cell + + if(nbRemoteWithSrc){ + iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->setSrcChildTrue(); + myThreadkernels->M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel); + } + if(nbRemoteWithTgt){ + iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->setTargetsChildTrue(); + } + firstProcThatSend += nbProcThatSendToMe - 1; + } + // Reset buffer + sendBuffer.reset(); + FLOG(singleCounter.tac()); + } + ///////////////////////////////////////////////////////////////////////////////////////// + // End Of Single section + //////////////////////////////////////////////////////////////////////////////////////// + // + + 
//////////////////////////////////////////////////////////////////////////////////////// + // All threads proceed the M2M with local cells + // + //////////////////////////////////////////////////////////////////////////////////////// #pragma omp for nowait schedule(dynamic, userChunkSize) - for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){ - // We need the current cell and the child - // child is an array (of 8 child) that may be null - CellClass* potentialChild[8]; - CellClass** const realChild = iterArray[idxCell].getCurrentChild(); - CellClass* const currentCell = iterArray[idxCell].getCurrentCell(); - for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ - potentialChild[idxChild] = nullptr; - if(realChild[idxChild]){ - if(realChild[idxChild]->hasSrcChild()){ - currentCell->setSrcChildTrue(); - potentialChild[idxChild] = realChild[idxChild]; - } - if(realChild[idxChild]->hasTargetsChild()){ - currentCell->setTargetsChildTrue(); - } - } - } - myThreadkernels->M2M( currentCell , potentialChild, idxLevel); - } - //////////////////////////////////////////////////////////////////////////////////////// - - }//End of parallel section - FLOG(parallelCounter.tac()); + for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){ + // We need the current cell and the child + // child is an array (of 8 child) that may be null + CellClass* potentialChild[8]; + CellClass** const realChild = iterArray[idxCell].getCurrentChild(); + CellClass* const currentCell = iterArray[idxCell].getCurrentCell(); + int nbChildWithSrc = 0; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + potentialChild[idxChild] = nullptr; + if(realChild[idxChild]){ + if(realChild[idxChild]->hasSrcChild()){ + nbChildWithSrc += 1; + potentialChild[idxChild] = realChild[idxChild]; + } + if(realChild[idxChild]->hasTargetsChild()){ + currentCell->setTargetsChildTrue(); + } + } + } + if(nbChildWithSrc){ + currentCell->setSrcChildTrue(); + myThreadkernels->M2M( currentCell , potentialChild, idxLevel); + } + } + //////////////////////////////////////////////////////////////////////////////////////// + + }//End of parallel section + FLOG(parallelCounter.tac()); + } + + FLOG(counterTime.tac()); + FLOG(computationCounter.tac()); + FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = " << counterTime.elapsed() << " s)\n" ); + FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); + FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); } - FLOG(counterTime.tac()); - FLOG(computationCounter.tac()); - FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = " << counterTime.elapsed() << " s)\n" ); - FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" ); - FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller.flush()); - } - - ///////////////////////////////////////////////////////////////////////////// - // Downard - ///////////////////////////////////////////////////////////////////////////// - - - void transferPass(){ - FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); ); - FLOG(FTic counterTime); - FLOG(FTic computationCounter); - FLOG(FTic sendCounter); - FLOG(FTic receiveCounter); - 
FLOG(FTic prepareCounter); - FLOG(FTic gatherCounter); + ///////////////////////////////////////////////////////////////////////////// + // Downard + ///////////////////////////////////////////////////////////////////////////// - ////////////////////////////////////////////////////////////////// - // First know what to send to who - ////////////////////////////////////////////////////////////////// - // pointer to send - std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]); - // index - long long int*const indexToSend = new long long int[nbProcess * OctreeHeight]; - memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight); - // To know which one has need someone - FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight]; - memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight); + void transferPass(){ + FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); ); + FLOG(FTic counterTime); + FLOG(FTic computationCounter); + FLOG(FTic sendCounter); + FLOG(FTic receiveCounter); + FLOG(FTic prepareCounter); + FLOG(FTic gatherCounter); + + ////////////////////////////////////////////////////////////////// + // First know what to send to who + ////////////////////////////////////////////////////////////////// - // All process say to each others - // what the will send to who - long long int*const globalReceiveMap = new long long int[nbProcess * nbProcess * OctreeHeight]; - memset(globalReceiveMap, 0, sizeof(long long int) * nbProcess * nbProcess * OctreeHeight); + // pointer to send + std::unique_ptr<FVector<typename OctreeClass::Iterator>[]> toSend(new FVector<typename OctreeClass::Iterator>[nbProcess * OctreeHeight]); + // index + long long int*const indexToSend = new long long int[nbProcess * OctreeHeight]; + memset(indexToSend, 0, sizeof(long long int) * nbProcess * OctreeHeight); + // To know which one has need someone + FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight]; + memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight); - FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight]; - memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight); + // All process say to each others + // what the will send to who + long long int*const globalReceiveMap = new long long int[nbProcess * nbProcess * OctreeHeight]; + memset(globalReceiveMap, 0, sizeof(long long int) * nbProcess * nbProcess * OctreeHeight); - FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight]; - memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight); + FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight]; + memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight); + + FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight]; + memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight); #pragma omp parallel num_threads(MaxThreads) - { + { #pragma omp master - { - { - FLOG(prepareCounter.tic()); - - std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]); - - // To know if a leaf has been already sent to a proc - bool*const alreadySent = new bool[nbProcess]; - memset(alreadySent, 0, sizeof(bool) * nbProcess); - - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.moveDown(); - - for(int idxLevel = 2 ; 
idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){ - octreeIterator.moveDown(); - } - - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - // for each levels - for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){ - - const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria); - - if(!procHasWorkAtLevel(idxLevel, idProcess)){ - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - continue; - } - - int numberOfCells = 0; - - while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){ - octreeIterator.moveRight(); - } - - // for each cells - do{ - iterArrayLocal[numberOfCells] = octreeIterator; - ++numberOfCells; - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - - leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells); - - // Which cell potentialy needs other data and in the same time - // are potentialy needed by other - MortonIndex neighborsIndexes[/*189+26+1*/216]; - for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){ - // Find the M2L neigbors of a cell - const int counter = iterArrayLocal[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes, separationCriteria); - - memset(alreadySent, false, sizeof(bool) * nbProcess); - bool needOther = false; - // Test each negibors to know which one do not belong to us - for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){ - if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex - || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){ - int procToReceive = idProcess; - while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){ - --procToReceive; - } - while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){ - ++procToReceive; - } - // Maybe already sent to that proc? 
- if( !alreadySent[procToReceive] - && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh] - && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){ - - alreadySent[procToReceive] = true; - - needOther = true; - - toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]); - if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){ - indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int); - } - indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp(); - indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex); - indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize); - } - } - } - if(needOther){ - leafsNeedOther[idxLevel]->set(idxCell,true); - } - } - } - FLOG(prepareCounter.tac()); - - delete[] alreadySent; - } - - ////////////////////////////////////////////////////////////////// - // Gather this information - ////////////////////////////////////////////////////////////////// - - FLOG(gatherCounter.tic()); - FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, fcomCompute.getComm()), __LINE__ ); - FLOG(gatherCounter.tac()); - - ////////////////////////////////////////////////////////////////// - // Send and receive for real - ////////////////////////////////////////////////////////////////// - - FLOG(sendCounter.tic()); - // Then they can send and receive (because they know what they will receive) - // To send in asynchrone way - std::vector<MPI_Request> requests; - requests.reserve(2 * nbProcess * OctreeHeight); - - for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){ - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc]; - if(toSendAtProcAtLevel != 0){ - sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel); - - sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize())); - - for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){ - const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize(); - sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell); - const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex(); - sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex); - toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]); - } - - FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel); - - FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(), - sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc, - FMpi::TagLast + idxLevel*100, fcomCompute, &requests); - } - - const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess]; - if(toReceiveFromProcAtLevel){ - recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel); - - FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(), - recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc, - FMpi::TagLast + idxLevel*100, fcomCompute, &requests); - } - } - } - - ////////////////////////////////////////////////////////////////// 
-    // Wait received data and compute
-    //////////////////////////////////////////////////////////
-
-    // Wait to receive every things (and send every things)
-    FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUS_IGNORE), __LINE__);
-
-    FLOG(sendCounter.tac());
-    }//End of Master region
-
-    //////////////////////////////////////////////////////////
-    // Do M2L
-    //////////////////////////////////////////////////////////
+            {
+                {
+                    FLOG(prepareCounter.tic());
+
+                    std::unique_ptr<typename OctreeClass::Iterator[]> iterArrayLocal(new typename OctreeClass::Iterator[numberOfLeafs]);
+
+                    // To know if a leaf has already been sent to a proc
+                    bool*const alreadySent = new bool[nbProcess];
+                    memset(alreadySent, 0, sizeof(bool) * nbProcess);
+
+                    typename OctreeClass::Iterator octreeIterator(tree);
+                    octreeIterator.moveDown();
+
+                    for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
+                        octreeIterator.moveDown();
+                    }
+
+                    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+                    // for each level
+                    for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
+
+                        const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
+
+                        if(!procHasWorkAtLevel(idxLevel, idProcess)){
+                            avoidGotoLeftIterator.moveDown();
+                            octreeIterator = avoidGotoLeftIterator;
+                            continue;
+                        }
+
+                        int numberOfCells = 0;
+
+                        while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
+                            octreeIterator.moveRight();
+                        }
+
+                        // for each cell
+                        do{
+                            iterArrayLocal[numberOfCells] = octreeIterator;
+                            ++numberOfCells;
+                        } while(octreeIterator.moveRight());
+                        avoidGotoLeftIterator.moveDown();
+                        octreeIterator = avoidGotoLeftIterator;
+
+                        leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);
+
+                        // Which cells potentially need other data and, at the same time,
+                        // are potentially needed by others
+                        MortonIndex neighborsIndexes[/*189+26+1*/216];
+                        for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
+                            // Find the M2L neighbors of a cell
+                            const int counter = iterArrayLocal[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes, separationCriteria);
+
+                            memset(alreadySent, false, sizeof(bool) * nbProcess);
+                            bool needOther = false;
+                            // Test each neighbor to know which ones do not belong to us
+                            for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+                                if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
+                                        || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
+                                    int procToReceive = idProcess;
+                                    while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
+                                        --procToReceive;
+                                    }
+                                    while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
+                                        ++procToReceive;
+                                    }
+                                    // Maybe already sent to that proc?
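+                                    // The working intervals are sorted by rank and disjoint, so
+                                    // walking left/right from our own rank finds the only candidate
+                                    // owner of this neighbor index; the bounds are re-checked below
+                                    // because an index can fall in a gap that no process owns.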
+                                    if( !alreadySent[procToReceive]
+                                            && getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
+                                            && neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){
+
+                                        needOther = true;
+
+                                        if(iterArrayLocal[idxCell].getCurrentCell()->hasSrcChild()){
+                                            alreadySent[procToReceive] = true;
+
+                                            toSend[idxLevel * nbProcess + procToReceive].push(iterArrayLocal[idxCell]);
+                                            if(indexToSend[idxLevel * nbProcess + procToReceive] == 0){
+                                                indexToSend[idxLevel * nbProcess + procToReceive] = sizeof(int);
+                                            }
+                                            indexToSend[idxLevel * nbProcess + procToReceive] += iterArrayLocal[idxCell].getCurrentCell()->getSavedSizeUp();
+                                            indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(MortonIndex);
+                                            indexToSend[idxLevel * nbProcess + procToReceive] += sizeof(FSize);
+                                        }
+                                        else{
+                                            // We have no sources, so we will not send this cell,
+                                            // but we still need to know that needOther is true
+                                            break;
+                                        }
+                                    }
+                                }
+                            }
+                            if(needOther && iterArrayLocal[idxCell].getCurrentCell()->hasTargetsChild()){
+                                // only keep the cells with targets in the list
+                                leafsNeedOther[idxLevel]->set(idxCell,true);
+                            }
+                        }
+                    }
+                    FLOG(prepareCounter.tac());
+
+                    delete[] alreadySent;
+                }
+
+                //////////////////////////////////////////////////////////
+                // Gather this information
+                //////////////////////////////////////////////////////////
+
+                FLOG(gatherCounter.tic());
+                FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_LONG_LONG_INT, fcomCompute.getComm()), __LINE__ );
+                FLOG(gatherCounter.tac());
+
+                //////////////////////////////////////////////////////////
+                // Send and receive for real
+                //////////////////////////////////////////////////////////
+
+                FLOG(sendCounter.tic());
+                // Then they can send and receive (because they know what they will receive)
+                // To send in an asynchronous way
+                std::vector<MPI_Request> requests;
+                requests.reserve(2 * nbProcess * OctreeHeight);
+
+                for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
+                    for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                        const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
+                        if(toSendAtProcAtLevel != 0){
+                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel);
+
+                            sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));
+
+                            for(int idxLeaf = 0 ; idxLeaf < toSend[idxLevel * nbProcess + idxProc].getSize(); ++idxLeaf){
+                                const FSize currentTell = sendBuffer[idxLevel * nbProcess + idxProc]->getSize();
+                                sendBuffer[idxLevel * nbProcess + idxProc]->write(currentTell);
+                                const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
+                                sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
+                                toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
+                            }
+
+                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);
+
+                            FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(),
+                                             sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc,
+                                             FMpi::TagLast + idxLevel*100, fcomCompute, &requests);
+                        }
+
+                        const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
+                        if(toReceiveFromProcAtLevel){
+                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel);
+
+                            FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
+                                             recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc,
+                                             FMpi::TagLast + idxLevel*100, fcomCompute, &requests);
+                        }
+                    }
+                }
+
+                //////////////////////////////////////////////////////////
+                // Wait received data and compute
+                //////////////////////////////////////////////////////////
+
+                // Wait to receive everything (and to send everything)
+                FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUS_IGNORE), __LINE__);
+
+                FLOG(sendCounter.tac());
+            }//End of Master region
+
+            //////////////////////////////////////////////////////////
+            // Do M2L
+            //////////////////////////////////////////////////////////
#pragma omp single nowait
-    {
-      typename OctreeClass::Iterator octreeIterator(tree);
-      octreeIterator.moveDown();
-
-      for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
-        octreeIterator.moveDown();
-      }
-
-      typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
-      // Now we can compute all the data
-      // for each levels
-      for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
-        const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
-
-        if(!procHasWorkAtLevel(idxLevel, idProcess)){
-          avoidGotoLeftIterator.moveDown();
-          octreeIterator = avoidGotoLeftIterator;
-          continue;
-        }
-
-        int numberOfCells = 0;
-        while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
-          octreeIterator.moveRight();
-        }
-        // for each cells
-        do{
-          iterArray[numberOfCells] = octreeIterator;
-          ++numberOfCells;
-        } while(octreeIterator.moveRight());
-        avoidGotoLeftIterator.moveDown();
-        octreeIterator = avoidGotoLeftIterator;
-
-        FLOG(computationCounter.tic());
-        {
-          const int chunckSize = userChunkSize;
-          for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
#pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
-            {
-              KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
-              const CellClass* neighbors[342];
-              int neighborPositions[342];
-
-              const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
-              for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
-                const int counter = tree->getInteractionNeighbors(neighbors, neighborPositions, iterArray[idxCellToCompute].getCurrentGlobalCoordinate(), idxLevel, separationCriteria);
-                if(counter) myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, neighborPositions, counter, idxLevel);
-              }
-            }
-          }
-        }//End of task spawning
+        {
+            typename OctreeClass::Iterator octreeIterator(tree);
+            octreeIterator.moveDown();
+
+            for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
+                octreeIterator.moveDown();
+            }
+
+            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+            // Now we can compute all the data
+            // for each level
+            for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
+                const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
+
+                if(!procHasWorkAtLevel(idxLevel, idProcess)){
+                    avoidGotoLeftIterator.moveDown();
+                    octreeIterator = avoidGotoLeftIterator;
+                    continue;
+                }
+
+                int numberOfCells = 0;
+                while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
+                    octreeIterator.moveRight();
+                }
+                // for each cell
+                do{
+                    iterArray[numberOfCells] = octreeIterator;
+                    ++numberOfCells;
+                } while(octreeIterator.moveRight());
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
+
+                FLOG(computationCounter.tic());
+                {
+                    const int chunckSize = userChunkSize;
+                    for(int idxCell = 0 ; idxCell < numberOfCells ; idxCell += chunckSize){
#pragma omp task default(none) shared(numberOfCells,idxLevel) firstprivate(idxCell) //+ shared(chunckSize)
+                        {
+                            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
+                            const CellClass* neighbors[342];
+                            int neighborPositions[342];
+
+                            const int nbCellToCompute = FMath::Min(chunckSize, numberOfCells-idxCell);
+                            for(int idxCellToCompute = idxCell ; idxCellToCompute < idxCell+nbCellToCompute ; ++idxCellToCompute){
+                                if(iterArray[idxCellToCompute].getCurrentCell()->hasTargetsChild()){
+                                    const int counter = tree->getInteractionNeighbors(neighbors, neighborPositions, iterArray[idxCellToCompute].getCurrentGlobalCoordinate(), idxLevel, separationCriteria);
+                                    if(counter){
+                                        // Keep only the neighbors that actually hold sources
+                                        int counterWithSrc = 0;
+                                        for(int idxRealNeighbors = 0 ; idxRealNeighbors < counter ; ++idxRealNeighbors ){
+                                            if(neighbors[idxRealNeighbors]->hasSrcChild()){
+                                                neighbors[counterWithSrc] = neighbors[idxRealNeighbors];
+                                                neighborPositions[counterWithSrc] = neighborPositions[idxRealNeighbors];
+                                                ++counterWithSrc;
+                                            }
+                                        }
+                                        myThreadkernels->M2L( iterArray[idxCellToCompute].getCurrentCell() , neighbors, neighborPositions, counterWithSrc, idxLevel);
+                                    }
+                                }
+                            }
+                        }
+                    }
+                }//End of task spawning
#pragma omp taskwait
-        for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
+                for(int idxThread = 0 ; idxThread < omp_get_num_threads() ; ++idxThread){
#pragma omp task default(none) firstprivate(idxThread,idxLevel)
-          {
-            kernels[idxThread]->finishedLevelM2L(idxLevel);
-          }
-        }
+                    {
+                        kernels[idxThread]->finishedLevelM2L(idxLevel);
+                    }
+                }
#pragma omp taskwait
-        FLOG(computationCounter.tac());
-      }
-    }
-  }
-
-
-  {
-    FLOG(receiveCounter.tic());
-    typename OctreeClass::Iterator octreeIterator(tree);
-    octreeIterator.moveDown();
-
-    for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
-      octreeIterator.moveDown();
-    }
-
-    typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
-    // compute the second time
-    // for each levels
-    for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
-      const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
-
-      if(!procHasWorkAtLevel(idxLevel, idProcess)){
-        avoidGotoLeftIterator.moveDown();
-        octreeIterator = avoidGotoLeftIterator;
-        continue;
-      }
-
-      // put the received data into a temporary tree
-      FLightOctree<CellClass> tempTree;
-      for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-        if(recvBuffer[idxLevel * nbProcess + idxProc]){
-          const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();
-
-          for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
-            const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
-            const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
-            FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);
-
-            const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();
-
-            CellClass* const newCell = new CellClass;
-            newCell->setMortonIndex(cellIndex);
-            newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);
-
-            tempTree.insertCell(cellIndex, idxLevel, newCell);
-          }
-
-          FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
-              recvBuffer[idxLevel * nbProcess + idxProc]->tell());
-        }
-      }
-
-      // take cells from our octree only if they are
-      // linked to received data
-      int numberOfCells = 0;
-      int realCellId = 0;
-
-      while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
-        octreeIterator.moveRight();
-      }
-      // for each cells
-      do{
-        // copy cells that need data from others
-        if(leafsNeedOther[idxLevel]->get(realCellId++)){
-          iterArray[numberOfCells++] = octreeIterator;
-        }
-      } while(octreeIterator.moveRight());
-      avoidGotoLeftIterator.moveDown();
-      octreeIterator = avoidGotoLeftIterator;
-
-      delete leafsNeedOther[idxLevel];
-      leafsNeedOther[idxLevel] = nullptr;
-
-      // Compute this cells
-      FLOG(computationCounter.tic());
+                FLOG(computationCounter.tac());
+            }
+        }
+    }
+
+
+    {
+        FLOG(receiveCounter.tic());
+        typename OctreeClass::Iterator octreeIterator(tree);
+        octreeIterator.moveDown();
+
+        for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
+            octreeIterator.moveDown();
+        }
+
+        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+        // compute a second time
+        // for each level
+        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < FAbstractAlgorithm::lowerWorkingLevel ; ++idxLevel ){
+            const int separationCriteria = (idxLevel != FAbstractAlgorithm::lowerWorkingLevel-1 ? 1 : leafLevelSeparationCriteria);
+
+            if(!procHasWorkAtLevel(idxLevel, idProcess)){
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
+                continue;
+            }
+
+            // put the received data into a temporary tree
+            FLightOctree<CellClass> tempTree;
+            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                if(recvBuffer[idxLevel * nbProcess + idxProc]){
+                    const int toReceiveFromProcAtLevel = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<int>();
+
+                    for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
+                        const FSize currentTell = recvBuffer[idxLevel * nbProcess + idxProc]->tell();
+                        const FSize verifCurrentTell = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<FSize>();
+                        FAssertLF(currentTell == verifCurrentTell, currentTell, " ", verifCurrentTell);
+
+                        const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->template getValue<MortonIndex>();
+
+                        CellClass* const newCell = new CellClass;
+                        newCell->setMortonIndex(cellIndex);
+                        newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);
+
+                        tempTree.insertCell(cellIndex, idxLevel, newCell);
+                    }
+
+                    FAssertLF(globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess] ==
+                            recvBuffer[idxLevel * nbProcess + idxProc]->tell());
+                }
+            }
+
+            // take cells from our octree only if they are
+            // linked to received data
+            int numberOfCells = 0;
+            int realCellId = 0;
+
+            while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
+                octreeIterator.moveRight();
+            }
+            // for each cell
+            do{
+                // copy cells that need data from others
+                if(leafsNeedOther[idxLevel]->get(realCellId++)){
+                    iterArray[numberOfCells++] = octreeIterator;
+                }
+            } while(octreeIterator.moveRight());
+            avoidGotoLeftIterator.moveDown();
+            octreeIterator = avoidGotoLeftIterator;
+
+            delete leafsNeedOther[idxLevel];
+            leafsNeedOther[idxLevel] = nullptr;
+
+            // Compute these cells
+            FLOG(computationCounter.tic());
#pragma omp parallel num_threads(MaxThreads)
-    {
-      KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
-      MortonIndex neighborsIndex[/*189+26+1*/216];
-      int neighborsPosition[/*189+26+1*/216];
-      const CellClass* neighbors[342];
-      int neighborPositions[342];
+        {
+            KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
+            MortonIndex neighborsIndex[/*189+26+1*/216];
+            int neighborsPosition[/*189+26+1*/216];
+            const CellClass* neighbors[342];
+            int neighborPositions[342];
#pragma omp for schedule(dynamic, userChunkSize) nowait
-      for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
-        // compute indexes
-        const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition, separationCriteria);
-
-        int counter = 0;
-        // does we receive this index from someone?
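+            // Only the border cells flagged in leafsNeedOther are revisited here; they
+            // were flagged only when they hold targets, hence the assertion just below.
+            // A dynamic schedule is used since these cells can have an uneven number of
+            // remote interactions.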
-        for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
-          if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
-              || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){
-
-            CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);
-
-            if(otherCell){
-              neighbors[counter] = otherCell;
-              neighborPositions[counter] = neighborsPosition[idxNeig];
-              ++counter;
-            }
-          }
-        }
-        // need to compute
-        if(counter){
-          myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, neighborPositions, counter, idxLevel);
-        }
-      }
-
-      myThreadkernels->finishedLevelM2L(idxLevel);
-    }
-    FLOG(computationCounter.tac());
-  }
-  FLOG(receiveCounter.tac());
-  }
-
-  for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
-    delete sendBuffer[idxComm];
-    delete recvBuffer[idxComm];
-  }
-  for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
-    delete leafsNeedOther[idxComm];
-  }
-  delete[] sendBuffer;
-  delete[] recvBuffer;
-  delete[] indexToSend;
-  delete[] leafsNeedOther;
-  delete[] globalReceiveMap;
-
-
-  FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = " << counterTime.tacAndElapsed() << " s)\n" );
-  FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
-  FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" );
-  FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
-  FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
-  FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
-  FLOG( FLog::Controller.flush());
-
-}
-
-  //////////////////////////////////////////////////////////////////
-  // ---------------- L2L ---------------
-  //////////////////////////////////////////////////////////////////
-
-  void downardPass(){ // second L2L
-    FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
-    FLOG(FTic counterTime);
-    FLOG(FTic computationCounter);
-    FLOG(FTic prepareCounter);
-    FLOG(FTic waitCounter);
-
-    // Start from leal level - 1
-    typename OctreeClass::Iterator octreeIterator(tree);
-    octreeIterator.moveDown();
-
-    for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
-      octreeIterator.moveDown();
+            for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
+                // compute indexes
+                const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition, separationCriteria);
+
+                FAssertLF(iterArray[idxCell].getCurrentCell()->hasTargetsChild());
+
+                int counter = 0;
+                // did we receive this index from someone?
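+                // Remote neighbors were deserialized into tempTree above; a null
+                // lookup below means either that nobody owns this index or, in this
+                // TSM variant, that the remote cell holds no sources and was never sent.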
+ for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){ + if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex) + || (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){ + + CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel); + + if(otherCell){ + neighbors[counter] = otherCell; + neighborPositions[counter] = neighborsPosition[idxNeig]; + ++counter; + } + } + } + // need to compute + if(counter){ + myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, neighborPositions, counter, idxLevel); + } + } + + myThreadkernels->finishedLevelM2L(idxLevel); + } + FLOG(computationCounter.tac()); + } + FLOG(receiveCounter.tac()); + } + + for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){ + delete sendBuffer[idxComm]; + delete recvBuffer[idxComm]; + } + for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){ + delete leafsNeedOther[idxComm]; + } + delete[] sendBuffer; + delete[] recvBuffer; + delete[] indexToSend; + delete[] leafsNeedOther; + delete[] globalReceiveMap; + + + FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = " << counterTime.tacAndElapsed() << " s)\n" ); + FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Send : " << sendCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); + } - typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator); - - // Max 1 receive and 7 send (but 7 times the same data) - MPI_Request*const requests = new MPI_Request[8]; - MPI_Request*const requestsSize = new MPI_Request[8]; - - const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1; - - FMpiBufferWriter sendBuffer; - FMpiBufferReader recvBuffer; - - int righestProcToSendTo = nbProcess - 1; - - // for each levels exepted leaf level - for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < heightMinusOne ; ++idxLevel ){ - // If nothing to do in the next level skip the current one - if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){ - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - continue; - } - - // Copy all the cells in an array even the one that are out of my working interval - int totalNbCellsAtLevel = 0; - do{ - iterArray[totalNbCellsAtLevel++] = octreeIterator; - } while(octreeIterator.moveRight()); - avoidGotoLeftIterator.moveDown(); - octreeIterator = avoidGotoLeftIterator; - - // Count the number of cells that are out of my working interval - int nbCellsToSkip = 0; - while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){ - nbCellsToSkip += 1; - } - - // Check if someone will send a cell to me - bool hasToReceive = false; - int idxProcToReceive = idProcess - 1; - if(idProcess != 0 && nbCellsToSkip){ - // Starting from my left neighbor stop at the first proc that has work to do (not null interval) - while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){ - idxProcToReceive -= 1; - } - // Check if we find such a proc and that it share a cell with us on the border - if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && 
procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
-                hasToReceive = true;
-            }
-        }
+    //////////////////////////////////////////////////////////////////
+    // ---------------- L2L ---------------
+    //////////////////////////////////////////////////////////////////
+
+    void downardPass(){ // second L2L
+        FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
+        FLOG(FTic counterTime);
+        FLOG(FTic computationCounter);
+        FLOG(FTic prepareCounter);
+        FLOG(FTic waitCounter);
+
+        // Start from leaf level - 1
+        typename OctreeClass::Iterator octreeIterator(tree);
+        octreeIterator.moveDown();
+
+        for(int idxLevel = 2 ; idxLevel < FAbstractAlgorithm::upperWorkingLevel ; ++idxLevel){
+            octreeIterator.moveDown();
+        }
+
+        typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+
+        // Max 1 receive and 7 sends (but 7 times the same data)
+        MPI_Request*const requests = new MPI_Request[8];
+        MPI_Request*const requestsSize = new MPI_Request[8];
+
+        const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1;
+
+        FMpiBufferWriter sendBuffer;
+        FMpiBufferReader recvBuffer;
+
+        int righestProcToSendTo = nbProcess - 1;
+
+        // for each level except the leaf level
+        for(int idxLevel = FAbstractAlgorithm::upperWorkingLevel ; idxLevel < heightMinusOne ; ++idxLevel ){
+            // If there is nothing to do in the next level, skip the current one
+            if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
+                continue;
+            }
+
+            // Copy all the cells into an array, even the ones that are out of my working interval
+            int totalNbCellsAtLevel = 0;
+            do{
+                iterArray[totalNbCellsAtLevel++] = octreeIterator;
+            } while(octreeIterator.moveRight());
+            avoidGotoLeftIterator.moveDown();
+            octreeIterator = avoidGotoLeftIterator;
+
+            // Count the number of cells that are out of my working interval
+            int nbCellsToSkip = 0;
+            while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
+                nbCellsToSkip += 1;
+            }
+
+            // Check if someone will send a cell to me
+            bool hasToReceive = false;
+            int idxProcToReceive = idProcess - 1;
+            if(idProcess != 0 && nbCellsToSkip){
+                // Starting from my left neighbor, stop at the first proc that has work to do (non-null interval)
+                while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
+                    idxProcToReceive -= 1;
+                }
+                // Check if we found such a proc and that it shares a cell with us on the border
+                if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
+                    hasToReceive = true;
+                }
+            }
#pragma omp parallel num_threads(MaxThreads)
-    {
-      KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]);
#pragma omp single nowait
-      {
-        FLOG(prepareCounter.tic());
-        int iterRequests = 0;
-        int iterRequestsSize = 0;
-        FSize recvBufferSize = 0;
-        FSize sendBufferSize;
-        // Post the receive
-        if(hasToReceive){
-          FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive,
-                                      FMpi::TagFmmL2LSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
-        }
-
-        // We have to be sure that we are not sending if we have no work in the current level
-        if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
-          int idxProcSend = idProcess + 1;
-          int nbMessageSent = 0;
-
// From the proc on the right to righestProcToSendTo, check if we have to send something - while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){ - // We know that if the proc has work at the next level it share a cell with us due to the while condition - if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){ - FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend)); - // If first message then serialize the cell to send - if( nbMessageSent == 0 ){ - // We send our last cell - iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer); - sendBufferSize = sendBuffer.getSize(); - } - // Post the send message - FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend, - FMpi::TagFmmL2LSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__); - FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, idxProcSend, - FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__); - // Inc and check the counter - nbMessageSent += 1; - FAssertLF(nbMessageSent <= 7); - } - idxProcSend += 1; - } - // Next time we will not need to go further than idxProcSend - righestProcToSendTo = idxProcSend; - } - - // Finalize the communication - if(iterRequestsSize){ - FLOG(waitCounter.tic()); - FAssertLF(iterRequestsSize <= 8); - FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, MPI_STATUSES_IGNORE), __LINE__); - FLOG(waitCounter.tac()); - } - - if(hasToReceive){ - recvBuffer.cleanAndResize(recvBufferSize); - FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max()); - FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_BYTE, idxProcToReceive, - FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__ ); - } - - if(iterRequests){ - FLOG(waitCounter.tic()); - FAssertLF(iterRequests <= 8); - FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE), __LINE__); - FLOG(waitCounter.tac()); - } - - // If we receive something proceed the L2L - if(hasToReceive){ - FAssertLF(iterRequests != 0); - // In this case we know that we have to perform the L2L with the last cell that are - // exclude from our working interval nbCellsToSkip-1 - iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer); - if( iterArray[nbCellsToSkip-1].getCurrentCell()->hasTargetsChild() ){ - CellClass* potentialChild[8]; - CellClass** const realChild = iterArray[nbCellsToSkip-1].getCurrentChild(); - CellClass* const currentCell = iterArray[nbCellsToSkip-1].getCurrentCell(); - for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ - if(realChild[idxChild] && realChild[idxChild]->hasTargetsChild()){ - potentialChild[idxChild] = realChild[idxChild]; - } - else{ - potentialChild[idxChild] = nullptr; - } - } - myThreadkernels->L2L( currentCell , potentialChild, idxLevel); - } - // myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), idxLevel); - } - FLOG(prepareCounter.tac()); - } + { + FLOG(prepareCounter.tic()); + int iterRequests = 0; + int iterRequestsSize = 0; + FSize recvBufferSize = 0; + FSize sendBufferSize; + // Post the receive + if(hasToReceive){ + FMpi::MpiAssert( MPI_Irecv( &recvBufferSize, 1, FMpi::GetType(recvBufferSize), idxProcToReceive, + FMpi::TagFmmL2LSize + idxLevel, 
fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__ );
+                    }
+
+                    // We have to be sure that we are not sending if we have no work in the current level
+                    if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo && procHasWorkAtLevel(idxLevel, idProcess)){
+                        int idxProcSend = idProcess + 1;
+                        int nbMessageSent = 0;
+                        // From the proc on the right to righestProcToSendTo, check if we have to send something
+                        while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
+                            // We know that if the proc has work at the next level it shares a cell with us, due to the while condition
+                            if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
+                                FAssertLF(procCoversMyRightBorderCell(idxLevel, idxProcSend));
+                                // If it is the first message, then serialize the cell to send
+                                if( nbMessageSent == 0 ){
+                                    if(iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->hasTargetsChild()){
+                                        sendBuffer << int(1);
+                                        // We send our last cell
+                                        iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
+                                    }
+                                    else{
+                                        sendBuffer << int(0);
+                                    }
+                                    sendBufferSize = sendBuffer.getSize();
+                                }
+                                // Post the send message
+                                FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend,
+                                                           FMpi::TagFmmL2LSize + idxLevel, fcomCompute.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
+                                FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
+                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, idxProcSend,
+                                                           FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__);
+                                // Increment and check the counter
+                                nbMessageSent += 1;
+                                FAssertLF(nbMessageSent <= 7);
+                            }
+                            idxProcSend += 1;
+                        }
+                        // Next time we will not need to go further than idxProcSend
+                        righestProcToSendTo = idxProcSend;
+                    }
+
+                    // Finalize the communication
+                    if(iterRequestsSize){
+                        FLOG(waitCounter.tic());
+                        FAssertLF(iterRequestsSize <= 8);
+                        FMpi::MpiAssert(MPI_Waitall( iterRequestsSize, requestsSize, MPI_STATUSES_IGNORE), __LINE__);
+                        FLOG(waitCounter.tac());
+                    }
+
+                    if(hasToReceive){
+                        recvBuffer.cleanAndResize(recvBufferSize);
+                        FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max());
+                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_BYTE, idxProcToReceive,
+                                                    FMpi::TagFmmL2L + idxLevel, fcomCompute.getComm(), &requests[iterRequests++]), __LINE__ );
+                    }
+
+                    if(iterRequests){
+                        FLOG(waitCounter.tic());
+                        FAssertLF(iterRequests <= 8);
+                        FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, MPI_STATUSES_IGNORE), __LINE__);
+                        FLOG(waitCounter.tac());
+                    }
+
+                    // If we received something, perform the L2L
+                    if(hasToReceive){
+                        FAssertLF(iterRequests != 0);
+                        // In this case we know that we have to perform the L2L with the last cells that are
+                        // excluded from our working interval, i.e. nbCellsToSkip-1
+                        int hasTgtChild;
+                        recvBuffer >> hasTgtChild;
+                        FAssertLF(hasTgtChild == 0 || hasTgtChild == 1);
+                        if(hasTgtChild){
+                            iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
+                            CellClass* potentialChild[8];
+                            CellClass** const realChild = iterArray[nbCellsToSkip-1].getCurrentChild();
+                            CellClass* const currentCell = iterArray[nbCellsToSkip-1].getCurrentCell();
+                            int nbTgtChild = 0;
+                            for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+                                if(realChild[idxChild] && realChild[idxChild]->hasTargetsChild()){
+                                    nbTgtChild += 1;
+                                    potentialChild[idxChild] = realChild[idxChild];
+                                }
+                                else{
+                                    potentialChild[idxChild] = 
nullptr; + } + } + if(nbTgtChild){ + myThreadkernels->L2L( currentCell , potentialChild, idxLevel); + } + } + // myThreadkernels->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), idxLevel); + } + FLOG(prepareCounter.tac()); + } #pragma omp single nowait - { - FLOG(computationCounter.tic()); - } - // Threads are working on all the cell of our working interval at that level + { + FLOG(computationCounter.tic()); + } + // Threads are working on all the cell of our working interval at that level #pragma omp for nowait schedule(dynamic, userChunkSize) - for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){ - if( iterArray[idxCell].getCurrentCell()->hasTargetsChild() ){ - CellClass* potentialChild[8]; - CellClass** const realChild = iterArray[idxCell].getCurrentChild(); - CellClass* const currentCell = iterArray[idxCell].getCurrentCell(); - for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ - if(realChild[idxChild] && realChild[idxChild]->hasTargetsChild()){ - potentialChild[idxChild] = realChild[idxChild]; - } - else{ - potentialChild[idxChild] = nullptr; - } - } - myThreadkernels->L2L( currentCell , potentialChild, idxLevel); - // Old form myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); - } - } - FLOG(computationCounter.tac()); - - sendBuffer.reset(); - recvBuffer.seek(0); - } - - delete[] requests; - delete[] requestsSize; - - FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = " << counterTime.tacAndElapsed() << " s)\n" ); - FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" ); - FLOG( FLog::Controller.flush()); + for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){ + if( iterArray[idxCell].getCurrentCell()->hasTargetsChild() ){ + CellClass* potentialChild[8]; + CellClass** const realChild = iterArray[idxCell].getCurrentChild(); + CellClass* const currentCell = iterArray[idxCell].getCurrentCell(); + int nbTgtChild = 0; + for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){ + if(realChild[idxChild] && realChild[idxChild]->hasTargetsChild()){ + potentialChild[idxChild] = realChild[idxChild]; + nbTgtChild += 1; + } + else{ + potentialChild[idxChild] = nullptr; + } + } + if(nbTgtChild){ + myThreadkernels->L2L( currentCell , potentialChild, idxLevel); + } + // Old form myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel); + } + } + } + FLOG(computationCounter.tac()); + + sendBuffer.reset(); + recvBuffer.seek(0); + } + delete[] requests; + delete[] requestsSize; + + FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = " << counterTime.tacAndElapsed() << " s)\n" ); + FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" ); + FLOG( FLog::Controller.flush()); } @@ -1308,367 +1382,373 @@ protected: // Direct ///////////////////////////////////////////////////////////////////////////// struct LeafData{ - FTreeCoordinate coord; - CellClass* cell; - ContainerClass* targets; - ContainerClass* sources; + FTreeCoordinate coord; + CellClass* cell; + ContainerClass* 
targets; + ContainerClass* sources; }; /** P2P */ void directPass(const bool p2pEnabled, const bool l2pEnabled){ - FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); ); - FLOG( FTic counterTime); - FLOG( FTic prepareCounter); - FLOG( FTic gatherCounter); - FLOG( FTic waitCounter); - FLOG(FTic computationCounter); - FLOG(FTic computation2Counter); + FLOG( FLog::Controller.write("\tStart Direct Pass\n").write(FLog::Flush); ); + FLOG( FTic counterTime); + FLOG( FTic prepareCounter); + FLOG( FTic gatherCounter); + FLOG( FTic waitCounter); + FLOG(FTic computationCounter); + FLOG(FTic computation2Counter); - /////////////////////////////////////////////////// - // Prepare data to send receive - /////////////////////////////////////////////////// - FLOG(prepareCounter.tic()); + /////////////////////////////////////////////////// + // Prepare data to send receive + /////////////////////////////////////////////////// + FLOG(prepareCounter.tic()); - FBoolArray leafsNeedOther(this->numberOfLeafs); - int countNeedOther = 0; + FBoolArray leafsNeedOther(this->numberOfLeafs); + int countNeedOther = 0; - // To store the result - OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() ); + // To store the result + OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() ); - // init - const int SizeShape = P2PExclusionClass::SizeShape; + // init + const int SizeShape = P2PExclusionClass::SizeShape; - int shapeLeaf[SizeShape]; - memset(shapeLeaf,0,SizeShape*sizeof(int)); + int shapeLeaf[SizeShape]; + memset(shapeLeaf,0,SizeShape*sizeof(int)); - LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs]; + LeafData* const leafsDataArray = new LeafData[this->numberOfLeafs]; - FVector<LeafData> leafsNeedOtherData(countNeedOther); + FVector<LeafData> leafsNeedOtherData(countNeedOther); - FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess]; - FSize partsToSend[nbProcess]; - memset(partsToSend, 0, sizeof(FSize) * nbProcess); + FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess]; + FSize partsToSend[nbProcess]; + memset(partsToSend, 0, sizeof(FSize) * nbProcess); #pragma omp parallel num_threads(MaxThreads) - { + { #pragma omp master // MUST WAIT to fill leafsNeedOther - if(p2pEnabled){ - // Copy leafs - { - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); - int idxLeaf = 0; - do{ - this->iterArray[idxLeaf++] = octreeIterator; - } while(octreeIterator.moveRight()); - } - - int alreadySent[nbProcess]; - - //Will store the indexes of the neighbors of current cell - MortonIndex indexesNeighbors[26]; - - for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){ - memset(alreadySent, 0, sizeof(int) * nbProcess); - bool needOther = false; - //Get the neighbors of current cell in indexesNeighbors, and their number in neighCount - const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors); - //Loop over the neighbor leafs - for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){ - //Test if leaf belongs to someone else (false if it's mine) - if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){ - needOther = true; - - // find the proc that will need current leaf - int procToReceive = 
idProcess;
-              while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].leftIndex){
-                --procToReceive; //scroll process "before" current process
-              }
-
-              while( procToReceive != nbProcess - 1 && (intervals[procToReceive].rightIndex) < indexesNeighbors[idxNeigh]){
-                ++procToReceive;//scroll process "after" current process
-              }
-              // Test : Not Already Send && be sure someone hold this interval
-              if( !alreadySent[procToReceive] && intervals[procToReceive].leftIndex <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].rightIndex){
-
-                alreadySent[procToReceive] = 1;
-                toSend[procToReceive].push( iterArray[idxLeaf] );
-                partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize();
-                partsToSend[procToReceive] += int(sizeof(MortonIndex));
-              }
-            }
-          }
-
-          if(needOther){ //means that something need to be sent (or received)
-            leafsNeedOther.set(idxLeaf,true);
-            ++countNeedOther;
-          }
-        }
-
-        // No idea why it is mandatory there, could it be a few line before,
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-          if(partsToSend[idxProc]){
-            partsToSend[idxProc] += int(sizeof(FSize));
-          }
-        }
-      }
+            if(p2pEnabled){
+                // Copy leafs
+                {
+                    typename OctreeClass::Iterator octreeIterator(tree);
+                    octreeIterator.gotoBottomLeft();
+                    int idxLeaf = 0;
+                    do{
+                        this->iterArray[idxLeaf++] = octreeIterator;
+                    } while(octreeIterator.moveRight());
+                }
+
+                int alreadySent[nbProcess];
+
+                //Will store the indexes of the neighbors of the current cell
+                MortonIndex indexesNeighbors[26];
+
+                for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
+                    memset(alreadySent, 0, sizeof(int) * nbProcess);
+                    bool needOther = false;
+                    //Get the neighbors of the current cell in indexesNeighbors, and their number in neighCount
+                    const int neighCount = (iterArray[idxLeaf].getCurrentGlobalCoordinate()).getNeighborsIndexes(OctreeHeight,indexesNeighbors);
+                    //Loop over the neighbor leafs
+                    for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
+                        //Test if the leaf belongs to someone else (false if it's mine)
+                        if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
+                            needOther = true;
+
+                            if(iterArray[idxLeaf].getCurrentListSrc()->getNbParticles()){
+                                // find the proc that will need the current leaf
+                                int procToReceive = idProcess;
+                                while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].leftIndex){
+                                    --procToReceive; //scroll process "before" current process
+                                }
+
+                                while( procToReceive != nbProcess - 1 && (intervals[procToReceive].rightIndex) < indexesNeighbors[idxNeigh]){
+                                    ++procToReceive;//scroll process "after" current process
+                                }
+                                // Test: not already sent && be sure someone holds this interval
+                                if( !alreadySent[procToReceive] && intervals[procToReceive].leftIndex <= indexesNeighbors[idxNeigh]
+                                        && indexesNeighbors[idxNeigh] <= intervals[procToReceive].rightIndex){
+                                    alreadySent[procToReceive] = 1;
+                                    toSend[procToReceive].push( iterArray[idxLeaf] );
+                                    partsToSend[procToReceive] += iterArray[idxLeaf].getCurrentListSrc()->getSavedSize();
+                                    partsToSend[procToReceive] += int(sizeof(MortonIndex));
+                                }
+                            }
+                            else{
+                                // No sources in this leaf: nothing to send, but needOther stays set
+                                break;
+                            }
+                        }
+                    }
+
+                    if(iterArray[idxLeaf].getCurrentListTargets()->getNbParticles() && needOther){ //means that something needs to be sent (or received)
+                        leafsNeedOther.set(idxLeaf,true);
+                        ++countNeedOther;
+                    }
+                }
+
+                // No idea why it is mandatory here, could it be a few lines before,
+                for(int idxProc = 0 ; idxProc < nbProcess ; 
++idxProc){ + if(partsToSend[idxProc]){ + partsToSend[idxProc] += int(sizeof(FSize)); + } + } + } #pragma omp barrier #pragma omp master // nowait - if(p2pEnabled){ - /* This a nbProcess x nbProcess matrix of integer - * let U and V be id of processes : - * globalReceiveMap[U*nbProcess + V] == size of information needed by V and own by U - */ - FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess]; - memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess); - - //Share to all processus globalReceiveMap - FLOG(gatherCounter.tic()); - FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend), - globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), fcomCompute.getComm()), __LINE__ ); - FLOG(gatherCounter.tac()); - - FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess]; - memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess); - - FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess]; - memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess); - - // To send in asynchrone way - std::vector<MPI_Request> requests; - requests.reserve(2 * nbProcess); - //Prepare receive - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. - //allocate buffer of right size - recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]); - - FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(), - idxProc, FMpi::TagFmmP2P, fcomCompute, &requests); - } - } - - // Prepare send - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(toSend[idxProc].getSize() != 0){ - sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]); - // << is equivalent to write(). - (*sendBuffer[idxProc]) << toSend[idxProc].getSize(); - for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){ - (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex(); - toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]); - } - - FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]); - - FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(), - idxProc, FMpi::TagFmmP2P, fcomCompute, &requests); - - } - } - - delete[] toSend; - - - ////////////////////////////////////////////////////////// - // Waitsend receive - ////////////////////////////////////////////////////////// - - std::unique_ptr<MPI_Status[]> status(new MPI_Status[requests.size()]); - // Wait data - FLOG(waitCounter.tic()); - MPI_Waitall(int(requests.size()), requests.data(), status.get()); - FLOG(waitCounter.tac()); - - for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ - if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me. 
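+            // The exchange is done in two steps: the sizes are shared first (MPI_Allgather
+            // below) so that every receive can be posted with an exact buffer size,
+            // then the particle payloads themselves are sent and received asynchronously.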
-          FAssertLF(recvBuffer[idxProc]);
-          FMpiBufferReader& currentBuffer = (*recvBuffer[idxProc]);
-          FSize nbLeaves;
-          currentBuffer >> nbLeaves;
-          for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
-            MortonIndex leafIndex;
-            currentBuffer >> leafIndex;
-            otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore(currentBuffer);
-          }
-          // Realease memory early
-          delete recvBuffer[idxProc];
-          recvBuffer[idxProc] = nullptr;
-        }
-      }
-
-      for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-        delete sendBuffer[idxProc];
-        delete recvBuffer[idxProc];
-      }
-      delete[] globalReceiveMap;
-    }
-
-    ///////////////////////////////////////////////////
-    // Prepare data for thread P2P
-    ///////////////////////////////////////////////////
+            if(p2pEnabled){
+                /* This is a nbProcess x nbProcess matrix of integers,
+                 * let U and V be ids of processes:
+                 * globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
+                 */
+                FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
+                memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);
+
+                //Share globalReceiveMap with all processes
+                FLOG(gatherCounter.tic());
+                FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend),
+                                                globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), fcomCompute.getComm()), __LINE__ );
+                FLOG(gatherCounter.tac());
+
+                FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
+                memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);
+
+                FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
+                memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);
+
+                // To send in an asynchronous way
+                std::vector<MPI_Request> requests;
+                requests.reserve(2 * nbProcess);
+                //Prepare the receives
+                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has something for me.
+                        //allocate a buffer of the right size
+                        recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]);
+
+                        FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(),
+                                         idxProc, FMpi::TagFmmP2P, fcomCompute, &requests);
+                    }
+                }
+
+                // Prepare the sends
+                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                    if(toSend[idxProc].getSize() != 0){
+                        sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]);
+                        // << is equivalent to write().
+                        (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
+                        for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
+                            (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
+                            toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
+                        }
+
+                        FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]);
+
+                        FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(),
+                                         idxProc, FMpi::TagFmmP2P, fcomCompute, &requests);
+
+                    }
+                }
+
+                delete[] toSend;
+
+
+                //////////////////////////////////////////////////////////
+                // Wait for the sends and receives
+                //////////////////////////////////////////////////////////
+
+                std::unique_ptr<MPI_Status[]> status(new MPI_Status[requests.size()]);
+                // Wait for the data
+                FLOG(waitCounter.tic());
+                MPI_Waitall(int(requests.size()), requests.data(), status.get());
+                FLOG(waitCounter.tac());
+
+                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has something for me.
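+                        // Every payload announced in globalReceiveMap has arrived at this
+                        // point (MPI_Waitall above). For example, with 3 processes, process 1
+                        // reads its incoming sizes at globalReceiveMap[0*3+1] and [2*3+1].
+                        // Unpack each leaf and rebuild the remote sources in otherP2Ptree.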
+ FAssertLF(recvBuffer[idxProc]); + FMpiBufferReader& currentBuffer = (*recvBuffer[idxProc]); + FSize nbLeaves; + currentBuffer >> nbLeaves; + for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){ + MortonIndex leafIndex; + currentBuffer >> leafIndex; + otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore(currentBuffer); + } + // Realease memory early + delete recvBuffer[idxProc]; + recvBuffer[idxProc] = nullptr; + } + } + + for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){ + delete sendBuffer[idxProc]; + delete recvBuffer[idxProc]; + } + delete[] globalReceiveMap; + } + + /////////////////////////////////////////////////// + // Prepare data for thread P2P + /////////////////////////////////////////////////// #pragma omp single // MUST WAIT! - { - typename OctreeClass::Iterator octreeIterator(tree); - octreeIterator.gotoBottomLeft(); + { + typename OctreeClass::Iterator octreeIterator(tree); + octreeIterator.gotoBottomLeft(); - // to store which shape for each leaf - typename OctreeClass::Iterator* const myLeafs = new typename OctreeClass::Iterator[this->numberOfLeafs]; - int*const shapeType = new int[this->numberOfLeafs]; + // to store which shape for each leaf + typename OctreeClass::Iterator* const myLeafs = new typename OctreeClass::Iterator[this->numberOfLeafs]; + int*const shapeType = new int[this->numberOfLeafs]; - for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){ - myLeafs[idxLeaf] = octreeIterator; + for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){ + myLeafs[idxLeaf] = octreeIterator; - const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate(); - const int shape = P2PExclusionClass::GetShapeIdx(coord); - shapeType[idxLeaf] = shape; + const FTreeCoordinate& coord = octreeIterator.getCurrentCell()->getCoordinate(); + const int shape = P2PExclusionClass::GetShapeIdx(coord); + shapeType[idxLeaf] = shape; - ++shapeLeaf[shape]; + ++shapeLeaf[shape]; - octreeIterator.moveRight(); - } + octreeIterator.moveRight(); + } - int startPosAtShape[SizeShape]; - startPosAtShape[0] = 0; - for(int idxShape = 1 ; idxShape < SizeShape ; ++idxShape){ - startPosAtShape[idxShape] = startPosAtShape[idxShape-1] + shapeLeaf[idxShape-1]; - } + int startPosAtShape[SizeShape]; + startPosAtShape[0] = 0; + for(int idxShape = 1 ; idxShape < SizeShape ; ++idxShape){ + startPosAtShape[idxShape] = startPosAtShape[idxShape-1] + shapeLeaf[idxShape-1]; + } - int idxInArray = 0; - for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf, ++idxInArray){ - const int shapePosition = shapeType[idxInArray]; + int idxInArray = 0; + for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf, ++idxInArray){ + const int shapePosition = shapeType[idxInArray]; - leafsDataArray[startPosAtShape[shapePosition]].coord = myLeafs[idxInArray].getCurrentGlobalCoordinate(); - leafsDataArray[startPosAtShape[shapePosition]].cell = myLeafs[idxInArray].getCurrentCell(); - leafsDataArray[startPosAtShape[shapePosition]].targets = myLeafs[idxInArray].getCurrentListTargets(); - leafsDataArray[startPosAtShape[shapePosition]].sources = myLeafs[idxInArray].getCurrentListSrc(); - if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData.push(leafsDataArray[startPosAtShape[shapePosition]]); + leafsDataArray[startPosAtShape[shapePosition]].coord = myLeafs[idxInArray].getCurrentGlobalCoordinate(); + leafsDataArray[startPosAtShape[shapePosition]].cell = myLeafs[idxInArray].getCurrentCell(); + leafsDataArray[startPosAtShape[shapePosition]].targets = 
myLeafs[idxInArray].getCurrentListTargets(); + leafsDataArray[startPosAtShape[shapePosition]].sources = myLeafs[idxInArray].getCurrentListSrc(); + if( leafsNeedOther.get(idxLeaf) ) leafsNeedOtherData.push(leafsDataArray[startPosAtShape[shapePosition]]); - ++startPosAtShape[shapePosition]; - } + ++startPosAtShape[shapePosition]; + } - delete[] shapeType; - delete[] myLeafs; + delete[] shapeType; + delete[] myLeafs; - FLOG(prepareCounter.tac()); - } + FLOG(prepareCounter.tac()); + } - ////////////////////////////////////////////////////////// - // Computation P2P that DO NOT need others data - ////////////////////////////////////////////////////////// + ////////////////////////////////////////////////////////// + // Computation P2P that DO NOT need others data + ////////////////////////////////////////////////////////// - { + { #pragma omp single nowait - { - FLOG(computationCounter.tic()); - int previous = 0; + { + FLOG(computationCounter.tic()); + int previous = 0; - for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){ - const int endAtThisShape = shapeLeaf[idxShape] + previous; - const int chunckSize = userChunkSize; + for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){ + const int endAtThisShape = shapeLeaf[idxShape] + previous; + const int chunckSize = userChunkSize; - for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; idxLeafs += chunckSize){ - const int nbLeavesInTask = FMath::Min(endAtThisShape-idxLeafs, chunckSize); + for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; idxLeafs += chunckSize){ + const int nbLeavesInTask = FMath::Min(endAtThisShape-idxLeafs, chunckSize); #pragma omp task default(none) firstprivate(nbLeavesInTask,idxLeafs) //+shared(leafsDataArray) - { - KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]); - // There is a maximum of 26 neighbors - ContainerClass* neighbors[26]; - int neighborPositions[26]; - - for(int idxTaskLeaf = idxLeafs ; idxTaskLeaf < (idxLeafs + nbLeavesInTask) ; ++idxTaskLeaf){ - LeafData& currentIter = leafsDataArray[idxTaskLeaf]; - if(l2pEnabled){ - myThreadkernels->L2P(currentIter.cell, currentIter.targets); - } - if(p2pEnabled){ - // need the current particles and neighbors particles - const int counter = tree->getLeafsNeighbors(neighbors, neighborPositions, currentIter.coord, OctreeHeight-1); - myThreadkernels->P2P( currentIter.coord,currentIter.targets, - currentIter.sources, neighbors, neighborPositions, counter); - } - } - } - } - previous = endAtThisShape; + { + KernelClass* myThreadkernels = (kernels[omp_get_thread_num()]); + // There is a maximum of 26 neighbors + ContainerClass* neighbors[26]; + int neighborPositions[26]; + + for(int idxTaskLeaf = idxLeafs ; idxTaskLeaf < (idxLeafs + nbLeavesInTask) ; ++idxTaskLeaf){ + LeafData& currentIter = leafsDataArray[idxTaskLeaf]; + if(l2pEnabled){ + myThreadkernels->L2P(currentIter.cell, currentIter.targets); + } + if(p2pEnabled){ + // need the current particles and neighbors particles + const int counter = tree->getLeafsNeighbors(neighbors, neighborPositions, currentIter.coord, OctreeHeight-1); + myThreadkernels->P2P( currentIter.coord,currentIter.targets, + currentIter.sources, neighbors, neighborPositions, counter); + } + } + } + } + previous = endAtThisShape; #pragma omp taskwait - } - FLOG(computationCounter.tac()); - } - } + } + FLOG(computationCounter.tac()); + } + } - // Wait the come to finish (and the previous computation also) + // Wait the come to finish (and the previous computation also) #pragma omp barrier - 
-    //////////////////////////////////////////////////////////
-    // Computation P2P that need others data
-    //////////////////////////////////////////////////////////
+        //////////////////////////////////////////////////////////
+        // Computation P2P that needs other processes' data
+        //////////////////////////////////////////////////////////
 #pragma omp master
-    { FLOG( computation2Counter.tic() ); }
-
-    if(p2pEnabled){
-        KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
-        // There is a maximum of 26 neighbors
-        ContainerClass* neighbors[26];
-        MortonIndex indexesNeighbors[26];
-        int indexArray[26];
-        int neighborPositions[26];
-        // Box limite
-        FAssertLF(leafsNeedOtherData.getSize() < std::numeric_limits<int>::max());
-        const int nbLeafToProceed = int(leafsNeedOtherData.getSize());
+        { FLOG( computation2Counter.tic() ); }
+
+        if(p2pEnabled){
+            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
+            // There is a maximum of 26 neighbors
+            ContainerClass* neighbors[26];
+            MortonIndex indexesNeighbors[26];
+            int indexArray[26];
+            int neighborPositions[26];
+            // Box limit
+            FAssertLF(leafsNeedOtherData.getSize() < std::numeric_limits<int>::max());
+            const int nbLeafToProceed = int(leafsNeedOtherData.getSize());
 #pragma omp for schedule(dynamic, userChunkSize)
-            for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
-                LeafData currentIter = leafsNeedOtherData[idxLeafs];
-
-                // need the current particles and neighbors particles
-                int counter = 0;
-
-                // Take possible data
-                const int nbNeigh = currentIter.coord.getNeighborsIndexes(OctreeHeight, indexesNeighbors, indexArray);
-
-                for(int idxNeigh = 0 ; idxNeigh < nbNeigh ; ++idxNeigh){
-                    if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
-                        ContainerClass*const hypotheticNeighbor = otherP2Ptree.getLeafSrc(indexesNeighbors[idxNeigh]);
-                        if(hypotheticNeighbor){
-                            neighbors[ counter ] = hypotheticNeighbor;
-                            neighborPositions[counter] = indexArray[idxNeigh];
-                            ++counter;
-                        }
-                    }
-                }
-                if(counter){
-                    myThreadkernels.P2PRemote( currentIter.cell->getCoordinate(), currentIter.targets,
-                                               currentIter.sources, neighbors, neighborPositions, counter);
-                }
-
-            }
-
-        }
-    }
+            for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
+                LeafData currentIter = leafsNeedOtherData[idxLeafs];
+                FAssertLF(currentIter.targets->getNbParticles());

-    delete[] leafsDataArray;
+                // need the current particles and the neighbor particles
+                int counter = 0;

-    FLOG(computation2Counter.tac());
+                // Take possible data
+                const int nbNeigh = currentIter.coord.getNeighborsIndexes(OctreeHeight, indexesNeighbors, indexArray);
+                for(int idxNeigh = 0 ; idxNeigh < nbNeigh ; ++idxNeigh){
+                    if(indexesNeighbors[idxNeigh] < (intervals[idProcess].leftIndex) || (intervals[idProcess].rightIndex) < indexesNeighbors[idxNeigh]){
+                        ContainerClass*const hypotheticNeighbor = otherP2Ptree.getLeafSrc(indexesNeighbors[idxNeigh]);
+                        if(hypotheticNeighbor){
+                            neighbors[ counter ] = hypotheticNeighbor;
+                            neighborPositions[counter] = indexArray[idxNeigh];
+                            ++counter;
+                        }
+                    }
+                }
+                if(counter){
+                    myThreadkernels.P2PRemote( currentIter.cell->getCoordinate(), currentIter.targets,
+                                               currentIter.sources, neighbors, neighborPositions, counter);
+                }

-    FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = " << counterTime.tacAndElapsed() << " s)\n" );
-    FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.elapsed() << " s\n" );
-    FLOG( FLog::Controller << "\t\t Computation P2P 2 : " << computation2Counter.elapsed() << " s\n" );
-    FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" );
-    FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" );
-    FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" );
-    FLOG( FLog::Controller.flush());
+            }

-    }
-    };
+        }
+    }
+
+    delete[] leafsDataArray;
+
+    FLOG(computation2Counter.tac());
+
+
+    FLOG( FLog::Controller << "\tFinished (@Direct Pass (L2P + P2P) = " << counterTime.tacAndElapsed() << " s)\n" );
+    FLOG( FLog::Controller << "\t\t Computation L2P + P2P : " << computationCounter.elapsed() << " s\n" );
+    FLOG( FLog::Controller << "\t\t Computation P2P 2 : " << computation2Counter.elapsed() << " s\n" );
+    FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" );
+    FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" );
+    FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" );
+    FLOG( FLog::Controller.flush());
+
+    }
+};
diff --git a/Src/Core/FFmmAlgorithmThreadTsm.hpp b/Src/Core/FFmmAlgorithmThreadTsm.hpp
index 35343de76..2ae666958 100644
--- a/Src/Core/FFmmAlgorithmThreadTsm.hpp
+++ b/Src/Core/FFmmAlgorithmThreadTsm.hpp
@@ -213,11 +213,12 @@ protected:
                 CellClass* potentialChild[8];
                 CellClass** const realChild = iterArray[idxCell].getCurrentChild();
                 CellClass* const currentCell = iterArray[idxCell].getCurrentCell();
+                int nbChildWithSrc = 0;
                 for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
                     potentialChild[idxChild] = nullptr;
                     if(realChild[idxChild]){
                         if(realChild[idxChild]->hasSrcChild()){
-                            currentCell->setSrcChildTrue();
+                            nbChildWithSrc += 1;
                             potentialChild[idxChild] = realChild[idxChild];
                         }
                         if(realChild[idxChild]->hasTargetsChild()){
@@ -225,7 +226,10 @@ protected:
                         }
                     }
                 }
-                myThreadkernels->M2M( currentCell , potentialChild, idxLevel);
+                if(nbChildWithSrc){
+                    currentCell->setSrcChildTrue();
+                    myThreadkernels->M2M( currentCell , potentialChild, idxLevel);
+                }
             }
         }
         FLOG(computationCounter.tac());
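The FFmmAlgorithmThreadTsm.hpp hunk above makes the TSM upward pass skip cells whose children carry no source particles: previously setSrcChildTrue() was raised child by child and M2M ran unconditionally, even for target-only cells where every entry of potentialChild was null. A reduced model of the new guard (simplified types, not the library's kernel API):

    // Reduced model of the guarded TSM M2M: kernel work is skipped entirely
    // for parents whose children hold no source particles.
    struct TsmCellModel { bool srcChild = false; };

    template<class M2MFn>
    void guardedM2M(TsmCellModel* parent, TsmCellModel* const children[8], M2MFn&& m2m){
        TsmCellModel* potentialChild[8] = {nullptr};
        int nbChildWithSrc = 0;
        for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
            if(children[idxChild] && children[idxChild]->srcChild){
                potentialChild[idxChild] = children[idxChild];
                nbChildWithSrc += 1;
            }
        }
        if(nbChildWithSrc){               // formerly M2M ran even when this was zero
            parent->srcChild = true;      // equivalent of setSrcChildTrue()
            m2m(parent, potentialChild);  // aggregate only where sources exist
        }
    }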
diff --git a/Tests/Utils/testFmmAlgorithmProcTsm.cpp b/Tests/Utils/testFmmAlgorithmProcTsm.cpp
new file mode 100644
index 000000000..864a12a41
--- /dev/null
+++ b/Tests/Utils/testFmmAlgorithmProcTsm.cpp
@@ -0,0 +1,505 @@
+// ===================================================================================
+// Copyright ScalFmm 2016 INRIA, Olivier Coulaud, Bérenger Bramas,
+// Matthias Messner olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the
+// FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+// An extension to the license is given to allow static linking of scalfmm
+// inside a proprietary application (no matter its license).
+// See the main license file for more details.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+
+// ==== CMAKE =====
+// @FUSE_MPI
+// ================
+
+#include "../../Src/Utils/FMpi.hpp"
+#include "../../Src/Utils/FTic.hpp"
+
+#include "../../Src/Containers/FOctree.hpp"
+#include "../../Src/Containers/FVector.hpp"
+#include "../../Src/Utils/FParameters.hpp"
+#include "../../Src/Utils/FGlobal.hpp"
+
+#include "../../Src/Components/FSimpleLeaf.hpp"
+
+#include "../../Src/Utils/FPoint.hpp"
+
+#include "../../Src/Components/FTestCell.hpp"
+#include "../../Src/Components/FTestKernels.hpp"
+#include "../../Src/Components/FTestParticleContainer.hpp"
+
+//#include "../../Src/Core/FFmmAlgorithmProcMpi.hpp"
+#include "../../Src/Core/FFmmAlgorithmThreadProcTsm.hpp"
+#include "../../Src/Core/FFmmAlgorithmThreadTsm.hpp"
+
+#include "../../Src/Files/FRandomLoader.hpp"
+#include "../../Src/Files/FMpiTreeBuilder.hpp"
+
+#include "../../Src/Components/FBasicKernels.hpp"
+
+#include "../../Src/Utils/FLeafBalance.hpp"
+
+#include "../../Src/Utils/FParameterNames.hpp"
+#include "../../Src/Extensions/FExtendCellType.hpp"
+
+#include <iostream>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>    // memset
+#include <vector>     // std::vector
+#include <algorithm>  // std::min
+#include <stdexcept>  // std::runtime_error
+
+
+/** This program shows an example of use of the threaded + MPI FMM algorithm.
+ * It also checks that each particle is impacted by every other particle.
+ */
+
+/////////////////////////////////////////////////////////////////////////////
+// Test function
+/////////////////////////////////////////////////////////////////////////////
+
+// Check that the tree is built correctly
+template<class OctreeClass>
+void ValidateTree(OctreeClass& realTree,
+                  OctreeClass& treeValide){
+
+    typename OctreeClass::Iterator octreeIteratorValide(&treeValide);
+    octreeIteratorValide.gotoBottomLeft();
+
+    typename OctreeClass::Iterator octreeIterator(&realTree);
+    octreeIterator.gotoBottomLeft();
+
+    while(octreeIteratorValide.getCurrentGlobalIndex() != octreeIterator.getCurrentGlobalIndex()){
+        if(octreeIteratorValide.moveRight() == false){
+            std::cout << "Error: the reference tree is smaller than the parallel one\n";
+            std::cout << "Valide tree stops at " << octreeIteratorValide.getCurrentGlobalIndex() << "\n";
+            std::cout << "Other is at " << octreeIterator.getCurrentGlobalIndex() << "\n";
+            return;
+        }
+    }
+
+    std::cout << "The tree starts at " << octreeIteratorValide.getCurrentGlobalIndex() << "\n";
+
+    while(true){
+        if(octreeIteratorValide.getCurrentGlobalIndex() != octreeIterator.getCurrentGlobalIndex()){
+            std::cout << "Error: the trees do not have the same index.\n";
+            std::cout << "Correct one is " << octreeIteratorValide.getCurrentGlobalIndex() << "\n";
+            std::cout << "Incorrect one is " << octreeIterator.getCurrentGlobalIndex() << "\n";
+            return;
+        }
+
+        if(octreeIteratorValide.getCurrentListSrc()->getNbParticles() != octreeIterator.getCurrentListSrc()->getNbParticles()){
+            std::cout << "Error: the trees do not have the same number of particles at index " << octreeIteratorValide.getCurrentGlobalIndex() << ".\n";
+            std::cout << "Correct one has " << octreeIteratorValide.getCurrentListSrc()->getNbParticles() << "\n";
+            std::cout << "Incorrect one has " << octreeIterator.getCurrentListSrc()->getNbParticles() << "\n";
+            return;
+        }
+
+        if(octreeIterator.moveRight() == false){
+            break;
+        }
+
+        if(octreeIteratorValide.moveRight() == false){
+            std::cout << "Error: the reference tree is smaller than the parallel one\n";
+        }
+    }
+
+    std::cout << "The tree stops at " << octreeIteratorValide.getCurrentGlobalIndex() << "\n";
+}
+
+
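ValidateTree above walks the reference tree and the distributed tree in lock step by Morton index, because each rank's tree holds only one contiguous Morton interval of the global leaf set; the reference iterator must first be advanced to the left bound of that interval. The alignment reduces to this pattern (illustrative iterator interface, not the FOctree one):

    // Advance the reference iterator to the first leaf owned by this rank,
    // i.e. the left bound of the rank's Morton interval.
    template<class IterModel>
    bool alignToInterval(IterModel& reference, const long long leftIndex){
        while(reference.index() != leftIndex){
            if(!reference.moveRight()){
                return false;   // reference exhausted before the interval: an error
            }
        }
        return true;            // both walks can now be compared leaf by leaf
    }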
+/** This function tests the octree to be sure that the FMM algorithm
+ * has worked completely.
+ */
+template<class OctreeClass, class ContainerClass, class FmmClassProc>
+void ValidateFMMAlgoProc(OctreeClass* const badTree,
+                         OctreeClass* const valideTree,
+                         FmmClassProc* const fmm){
+    std::cout << "The working interval is from " << fmm->getWorkingInterval(badTree->getHeight()-1).leftIndex << "\n";
+    std::cout << "The working interval is to " << fmm->getWorkingInterval(badTree->getHeight()-1).rightIndex << "\n";
+    std::cout << "\tValidate L2L M2M...\n";
+    const int OctreeHeight = badTree->getHeight();
+    {
+        typename OctreeClass::Iterator octreeIterator(badTree);
+        octreeIterator.gotoBottomLeft();
+
+        typename OctreeClass::Iterator octreeIteratorValide(valideTree);
+        octreeIteratorValide.gotoBottomLeft();
+
+        for(int level = OctreeHeight - 1 ; level > 0 && fmm->hasWorkAtLevel(level) ; --level){
+
+            while(octreeIteratorValide.getCurrentGlobalIndex() != octreeIterator.getCurrentGlobalIndex()) {
+                octreeIteratorValide.moveRight();
+            }
+
+            while(octreeIteratorValide.getCurrentGlobalIndex() != fmm->getWorkingInterval(level).leftIndex){
+                octreeIteratorValide.moveRight();
+                octreeIterator.moveRight();
+            }
+
+            FSize countCheck = 0;
+            do{
+                if(octreeIterator.getCurrentGlobalIndex() != octreeIteratorValide.getCurrentGlobalIndex()){
+                    std::cout << "Problem: indexes are not equal! " << octreeIterator.getCurrentGlobalIndex() << " " << octreeIteratorValide.getCurrentGlobalIndex() << std::endl;
+                }
+                else{
+                    if(octreeIterator.getCurrentCell()->getDataUp() != octreeIteratorValide.getCurrentCell()->getDataUp()){
+                        std::cout << "Problem M2M error at level " << level << " up bad " << octreeIterator.getCurrentCell()->getDataUp()
+                                  << " good " << octreeIteratorValide.getCurrentCell()->getDataUp() << " index " << octreeIterator.getCurrentGlobalIndex() << std::endl;
+                    }
+                    if(octreeIterator.getCurrentCell()->getDataDown() != octreeIteratorValide.getCurrentCell()->getDataDown()){
+                        std::cout << "Problem L2L error at level " << level << " down bad " << octreeIterator.getCurrentCell()->getDataDown()
+                                  << " good " << octreeIteratorValide.getCurrentCell()->getDataDown() << " index " << octreeIterator.getCurrentGlobalIndex() << std::endl;
+                    }
+                }
+                ++countCheck;
+            } while(octreeIteratorValide.moveRight() && octreeIterator.moveRight());
+
+            // Move both iterators up one level for the next iteration
+            octreeIterator.moveUp();
+            octreeIterator.gotoLeft();
+
+            octreeIteratorValide.moveUp();
+            octreeIteratorValide.gotoLeft();
+        }
+    }
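The "Validate L2P P2P" block below relies on one invariant of the counting kernel: each target particle must accumulate exactly one contribution per source particle. With a TSM tree targets are not sources, so the expected value is NbPart; in the classic case a particle is both source and target and does not interact with itself, hence NbPart - 1. As a one-line restatement (hypothetical helper, not present in the file):

    // Expected dataDown value for one target particle, as checked below.
    long long expectedDataDown(const bool isUsingTsm, const long long nbSources){
        return isUsingTsm ? nbSources : nbSources - 1;
    }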
+    std::cout << "\tValidate L2P P2P...\n";
+    {
+        FSize NbPart = 0;
+        FSize NbLeafs = 0;
+        { // Count the total number of source particles and leaves
+            typename OctreeClass::Iterator octreeIterator(valideTree);
+            octreeIterator.gotoBottomLeft();
+            do{
+                NbPart += octreeIterator.getCurrentListSrc()->getNbParticles();
+                ++NbLeafs;
+            } while(octreeIterator.moveRight());
+        }
+        {
+            // Check that each particle has been summed with all the others
+            typename OctreeClass::Iterator octreeIterator(badTree);
+            octreeIterator.gotoBottomLeft();
+
+            do {
+                const bool isUsingTsm = (octreeIterator.getCurrentListTargets() != octreeIterator.getCurrentListSrc());
+
+                ContainerClass* container = (octreeIterator.getCurrentListTargets());
+                const long long int*const dataDown = container->getDataDown();
+
+                for(FSize idxPart = 0 ; idxPart < container->getNbParticles() ; ++idxPart){
+                    // A target must be impacted by every source: NbPart with TSM,
+                    // NbPart - 1 otherwise (a particle does not interact with itself)
+                    if( (!isUsingTsm && dataDown[idxPart] != NbPart - 1) ||
+                        (isUsingTsm && dataDown[idxPart] != NbPart) ){
+                        std::cout << "Problem L2P + P2P, value on particle is : " << dataDown[idxPart] <<
+                                     " at pos " << idxPart << " index is " << octreeIterator.getCurrentGlobalIndex() << "\n";
+                    }
+                }
+            } while( octreeIterator.moveRight());
+        }
+    }
+    std::cout << "\tValidate P2M...\n";
+    {
+        {
+            // Check that each leaf has summed its source particles
+            typename OctreeClass::Iterator octreeIterator(badTree);
+            octreeIterator.gotoBottomLeft();
+
+            do {
+                if(octreeIterator.getCurrentListSrc()->getNbParticles() != octreeIterator.getCurrentCell()->getDataUp()){
+                    printf("P2M Problem nb part %lld data up %lld \n",
+                           octreeIterator.getCurrentListSrc()->getNbParticles(), octreeIterator.getCurrentCell()->getDataUp());
+                }
+            } while( octreeIterator.moveRight() );
+        }
+    }
+    std::cout << "\tValidate Particles...\n";
+    {
+        // Check that both trees hold the same per-particle values
+        typename OctreeClass::Iterator octreeIterator(badTree);
+        octreeIterator.gotoBottomLeft();
+
+        typename OctreeClass::Iterator valideOctreeIterator(valideTree);
+        valideOctreeIterator.gotoBottomLeft();
+        while(valideOctreeIterator.getCurrentGlobalIndex() != octreeIterator.getCurrentGlobalIndex()){
+            valideOctreeIterator.moveRight();
+        }
+
+        do {
+            if(valideOctreeIterator.getCurrentGlobalIndex() != octreeIterator.getCurrentGlobalIndex()){
+                printf("Problem: the trees do not have the same index, valide %lld invalide %lld \n",
+                       valideOctreeIterator.getCurrentGlobalIndex(), octreeIterator.getCurrentGlobalIndex());
+                break;
+            }
+
+            if(octreeIterator.getCurrentListTargets()->getNbParticles() != valideOctreeIterator.getCurrentListTargets()->getNbParticles()){
+                printf("Problem: not the same number of particles at leaf %lld, valide %lld invalide %lld \n",
+                       octreeIterator.getCurrentGlobalIndex(), valideOctreeIterator.getCurrentListTargets()->getNbParticles(),
+                       octreeIterator.getCurrentListTargets()->getNbParticles());
+            }
+            else {
+                ContainerClass* container = (octreeIterator.getCurrentListTargets());
+                const long long int*const dataDown = container->getDataDown();
+
+                ContainerClass* containerValide = (valideOctreeIterator.getCurrentListTargets());
+                const long long int*const dataDownValide = containerValide->getDataDown();
+
+                for(FSize idxPart = 0 ; idxPart < container->getNbParticles() ; ++idxPart){
+                    // The parallel and reference trees must agree on every particle value
+                    if( dataDown[idxPart] != dataDownValide[idxPart]){
+                        std::cout << "Problem on leaf " << octreeIterator.getCurrentGlobalIndex() <<
+                                     " part " << idxPart << " valide data down " << dataDownValide[idxPart] <<
+                                     " invalide " << dataDown[idxPart] << "\n";
+                        std::cout << "Data down for leaf is: valide " << valideOctreeIterator.getCurrentCell()->getDataDown()
+                                  << " invalide " << octreeIterator.getCurrentCell()->getDataDown()
+                                  << " size is: valide " << valideOctreeIterator.getCurrentListTargets()->getNbParticles()
+                                  << " invalide " << octreeIterator.getCurrentListTargets()->getNbParticles() << std::endl;
+                    }
+                }
+            }
+
+        }while( octreeIterator.moveRight() && valideOctreeIterator.moveRight());
+    }
+    std::cout << "\tDone!\n";
+}
+
+
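A detail worth noting for main() further down: both insertion branches derive the particle type from the parity of the global particle index, so the source/target split is identical whatever the distribution across ranks, which is what lets the totalNbSources/totalNbTargets assertion hold. Condensed into an illustrative helper (not present in the file):

    // Odd global indices become sources, even ones targets, on every rank.
    FParticleType typeOfGlobalIndex(const long long globalIdx){
        return (globalIdx & 1) ? FParticleType::FParticleTypeSource
                               : FParticleType::FParticleTypeTarget;
    }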
+/** To print an octree,
+ * used to debug and understand how the values were passed.
+ */
+template<class OctreeClass>
+void print(OctreeClass* const valideTree){
+    typename OctreeClass::Iterator octreeIterator(valideTree);
+    for(int idxLevel = valideTree->getHeight() - 1 ; idxLevel > 1 ; --idxLevel ){
+        do{
+            std::cout << "[" << octreeIterator.getCurrentGlobalIndex() << "] up:" << octreeIterator.getCurrentCell()->getDataUp() << " down:" << octreeIterator.getCurrentCell()->getDataDown() << "\t";
+        } while(octreeIterator.moveRight());
+        std::cout << "\n";
+        octreeIterator.gotoLeft();
+        octreeIterator.moveDown();
+    }
+}
+
+
+
+/////////////////////////////////////////////////////////////////////
+// Main
+/////////////////////////////////////////////////////////////////////
+
+class FTestCellTsm: public FTestCell , public FExtendCellType{
+};
+
+// Simply create particles and try the kernels
+int main(int argc, char ** argv){
+    /////////////////////////////////////////////////////////////////////
+    // Define the classes to use
+    /////////////////////////////////////////////////////////////////////
+
+    typedef double FReal;
+
+    typedef FTestCellTsm CellClass;
+    typedef FTestParticleContainer<FReal> ContainerClass;
+
+    typedef FSimpleLeaf<FReal, ContainerClass > LeafClass;
+    typedef FOctree<FReal, CellClass, ContainerClass , LeafClass > OctreeClass;
+    typedef FTestKernels< CellClass, ContainerClass > KernelClass;
+
+    typedef FFmmAlgorithmThreadTsm<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass > FmmClass;
+    typedef FFmmAlgorithmThreadProcTsm<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass > FmmClassProc;
+
+    FHelpDescribeAndExit(argc, argv,
+                         "Test the distributed FMM algorithm by counting the number of interactions each particle receives.",
+                         FParameterDefinitions::OctreeHeight, FParameterDefinitions::OctreeSubHeight,
+                         FParameterDefinitions::InputFile);
+    ///////////////////////What we do/////////////////////////////
+    std::cout << ">> This executable has to be used to test the FMM algorithm.\n";
+    //////////////////////////////////////////////////////////////
+
+    FMpi app( argc, argv);
+
+    const int NbLevels = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5);
+    const int SizeSubLevels = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeSubHeight.options, 3);
+    FTic counter;
+    const FSize nbParticles = 100000;
+    FRandomLoader<FReal> loader(nbParticles, 1.0, FPoint<FReal>(0,0,0), nbParticles);
+    if(!loader.isOpen()) throw std::runtime_error("Particle file couldn't be opened!");
+
+    std::vector<FPoint<FReal>> allParticles(nbParticles);
+    for(FSize idxPart = 0 ; idxPart < nbParticles ; ++idxPart){
+        loader.fillParticle(&allParticles[idxPart]);
+    }
+
+    const double chunckSize = double(nbParticles+app.global().processCount()-1)/double(app.global().processCount());
+    const FSize offsetParticles = chunckSize*double(app.global().processId());
+    const FSize nbPartsForMe = std::min(nbParticles, FSize(chunckSize*double(app.global().processId()+1)))
+                               - offsetParticles;
+    const FReal boxWidth = loader.getBoxWidth();
+    const FPoint<FReal> centerOfBox = loader.getCenterOfBox();
+
+    std::cout << "Simulation properties:\n";
+    std::cout << "Nb Particles For Me : " << nbPartsForMe << "\n";
+    std::cout << "Box Width : " << boxWidth << "\n";
+    std::cout << "Box Center : " << centerOfBox << "\n";
+    std::cout << "offsetParticles : " << offsetParticles << "\n";
+    std::cout << "nbPartsForMe : " << nbPartsForMe << "\n";
+
+    // The real tree to work on
+    OctreeClass realTree(NbLevels, SizeSubLevels,boxWidth,centerOfBox);
+
+    FSize totalNbSources = 0;
+    FSize totalNbTargets = 0;
+
+    if( app.global().processCount() != 1){
+        //////////////////////////////////////////////////////////////////////////////////
+        // Build the tree with the MPI tree builder
+        //////////////////////////////////////////////////////////////////////////////////
+        std::cout << "Build Tree ..." << std::endl;
+        counter.tic();
+
+        struct TestParticle{
+            FPoint<FReal> position;
+            const FPoint<FReal>& getPosition(){
+                return position;
+            }
+            bool isSource;
+        };
+
+        TestParticle* particles = new TestParticle[nbPartsForMe];
+        memset(particles, 0, sizeof(TestParticle) * nbPartsForMe);
+        FSize nbSources = 0;
+        FSize nbTargets = 0;
+        for(FSize idxPart = 0 ; idxPart < nbPartsForMe ; ++idxPart){
+            FPoint<FReal> position = allParticles[idxPart + offsetParticles];
+            particles[idxPart].position = position;
+            particles[idxPart].isSource = ((idxPart + offsetParticles)&1);
+            if(particles[idxPart].isSource){
+                nbSources += 1;
+            }
+            else{
+                nbTargets += 1;
+            }
+        }
+
+
+        std::cout << "I init " << nbPartsForMe << " particles\n";
+        std::cout << "I init " << nbSources << " sources\n";
+        std::cout << "I init " << nbTargets << " targets\n";
+
+        totalNbSources = app.global().allReduceSum(nbSources);
+        totalNbTargets = app.global().allReduceSum(nbTargets);
+
+        std::cout << "totalNbSources " << totalNbSources << " sources\n";
+        std::cout << "totalNbTargets " << totalNbTargets << " targets\n";
+
+        FVector<TestParticle> finalParticles;
+        FLeafBalance balancer;
+        FMpiTreeBuilder< FReal,TestParticle >::DistributeArrayToContainer(app.global(),particles,
+                                                                          nbPartsForMe,
+                                                                          realTree.getBoxCenter(),
+                                                                          realTree.getBoxWidth(),realTree.getHeight(),
+                                                                          &finalParticles, &balancer);
+        std::cout << "I have now " << finalParticles.getSize() << " particles\n";
+
+        for(FSize idx = 0 ; idx < finalParticles.getSize(); ++idx){
+            realTree.insert(finalParticles[idx].position,
+                            finalParticles[idx].isSource? FParticleType::FParticleTypeSource: FParticleType::FParticleTypeTarget);
+        }
+
+        delete[] particles;
+
+        counter.tac();
+        std::cout << "Done " << "(" << counter.elapsed() << "s)." << std::endl;
+
+        //////////////////////////////////////////////////////////////////////////////////
+    }
+    else{
+        FAssertLF(offsetParticles == 0);
+        for(FSize idxPart = 0 ; idxPart < nbPartsForMe ; ++idxPart){
+            const bool isSource = idxPart&1;
+            realTree.insert(allParticles[idxPart],
+                            isSource? FParticleType::FParticleTypeSource: FParticleType::FParticleTypeTarget);
+            if(isSource){
+                totalNbSources += 1;
+            }
+            else{
+                totalNbTargets += 1;
+            }
+        }
+    }
+    FAssertLF(totalNbSources + totalNbTargets == loader.getNumberOfParticles(),
+              totalNbSources, "+", totalNbTargets, "==", loader.getNumberOfParticles());
+
+    //////////////////////////////////////////////////////////////////////////////////
+    // Create the reference (validation) tree, holding all particles on every process
+    //////////////////////////////////////////////////////////////////////////////////
+
+    OctreeClass treeValide(NbLevels, SizeSubLevels,boxWidth,centerOfBox);
+    {
+        for(FSize idxPart = 0 ; idxPart < nbParticles ; ++idxPart){
+            bool isSource = idxPart&1;
+            treeValide.insert(allParticles[idxPart],
+                              isSource? FParticleType::FParticleTypeSource: FParticleType::FParticleTypeTarget);
+        }
+    }
+
+    //////////////////////////////////////////////////////////////////////////////////
+    // Check the particles in the tree
+    //////////////////////////////////////////////////////////////////////////////////
+    std::cout << "Validate tree ..." << std::endl;
+    counter.tic();
+
+    ValidateTree(realTree, treeValide);
+
+    counter.tac();
+    std::cout << "Done " << "(" << counter.elapsed() << "s)." << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << "Working parallel particles ..." << std::endl;
+    counter.tic();
+
+    KernelClass kernels;
+
+    FmmClassProc algo(app.global(),&realTree,&kernels);
+    algo.execute();
+
+    counter.tac();
+    std::cout << "Done " << "(@Algorithm Particles = " << counter.elapsed() << "s)." << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << "Working sequential particles ..." << std::endl;
+    counter.tic();
+
+    FmmClass algoValide(&treeValide,&kernels);
+    algoValide.execute();
+
+    counter.tac();
+    std::cout << "Done " << "(@Algorithm Particles = " << counter.elapsed() << "s)." << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////////////
+
+    std::cout << "Checking data ..." << std::endl;
+    counter.tic();
+
+    ValidateFMMAlgoProc<OctreeClass,ContainerClass, FmmClassProc>(&realTree,&treeValide,&algo);
+
+    counter.tac();
+    std::cout << "Done " << "(" << counter.elapsed() << "s)." << std::endl;
+
+    //////////////////////////////////////////////////////////////////////////////////
+    //////////////////////////////////////////////////////////////////////////////////
+
+    return 0;
+}
--
GitLab
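Once ScalFmm is configured with MPI support (the @FUSE_MPI marker opts the new test into the MPI build), the test can be exercised with a few ranks. A hedged invocation sketch: the binary name follows the Tests/Utils naming scheme, and the -h/-sh option strings are assumed from FParameterDefinitions rather than verified here:

    mpiexec -n 4 ./Tests/Utils/testFmmAlgorithmProcTsm -h 5 -sh 3

The counting kernel makes the check independent of the number of ranks, so any small process count should report the same validation output.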