diff --git a/Src/Containers/FBufferVector.hpp b/Src/Containers/FBufferVector.hpp
deleted file mode 100644
index c5f7e53b9801c618b4c3db0c3f1a4edf071916ef..0000000000000000000000000000000000000000
--- a/Src/Containers/FBufferVector.hpp
+++ /dev/null
@@ -1,99 +0,0 @@
-#ifndef FBUFFERVECTOR_HPP
-#define FBUFFERVECTOR_HPP
-// /!\ Please, you must read the license at the bottom of this page
-
-#include "../Utils/FAssertable.hpp"
-// To get memcpy
-#include <cstring>
-
-/**
-* @author Berenger Bramas (berenger.bramas@inria.fr)
-* @class FBufferVector
-* Please read the license
-* This class is a buffer to send big data via MPI.
-* It has a capacity and alloc an array only if needed.
-* You have to be sure that the buffer is not full before adding any data.
-* This can be done by checking :
-* <code>
-* if(!sendBuffer.addEnoughSpace(object->bytesToSendUp())){<br>
-*    app.sendData(idxReceiver,sendBuffer.getSize(),sendBuffer.getData(),globalTag);<br>
-*    sendBuffer.clear();<br>
-* }<br>
-* sendBuffer.addDataUp(tag,object);<br>
-* </code>
-*/
-template <int Capacity>
-        class FBufferVector : protected FAssertable {
-    // the Buffer (can be null)
-    char* buffer;
-    // current ocupped size
-    int occuped;
-
-    // check if buffer is allocated if not does it
-    void checkBuffer(){
-        if(!buffer){
-            buffer = new char[Capacity];
-        }
-    }
-
-public:
-    // Constructor, set buffer to null
-    FBufferVector() : buffer(0), occuped(0) {
-        fassert(Capacity > 0, "Capacity has to be positive", __LINE__, __FILE__);
-    }
-
-    // Dealloc buffer if needed
-    ~FBufferVector(){
-        if(buffer) delete(buffer);
-    }
-
-    // To check if there is enough free space
-    bool addEnoughSpace(const int neededSpace) const{
-        return occuped + neededSpace + sizeof(int) <= Capacity;
-    }
-
-    // get the capacity of the buffer
-    int getCapacity() const{
-        return Capacity;
-    }
-
-    // get the current size
-    int getSize(){
-        return occuped;
-    }
-
-    // get a pointer to the buffer
-    void* getData() {
-        return buffer;
-    }
-
-    // ask to an object to write himself into the buffer
-    // and update occuped space
-    template < class T >
-    void addDataUp(const int tag, const T& object){
-        checkBuffer();
-        memcpy(&buffer[occuped],&tag,sizeof(int));
-        occuped += sizeof(int);
-        occuped += object.writeUp(&buffer[occuped], Capacity - occuped);
-    }
-
-    // ask to an object to write himself into the buffer
-    // and update occuped space
-    template < class T >
-    void addDataDown(const int tag, const T& object){
-        checkBuffer();
-        memcpy(&buffer[occuped],&tag,sizeof(int));
-        occuped += sizeof(int);
-        occuped += object.writeDown(&buffer[occuped], Capacity - occuped);
-    }
-
-    // reset occuped memory
-    void clear(){
-        occuped = 0;
-    }
-};
-
-
-#endif //FBUFFERVECTOR_HPP
-
-// [--LICENSE--]
diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp
index 64a979bed126d1997a2bbe1ebaa7aa7952f04036..cafc373cb08d36dbaa2494d4e2b01a1c3edd7c84 100644
--- a/Src/Core/FFmmAlgorithmThreadProc.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProc.hpp
@@ -10,7 +10,6 @@
 
 #include "../Containers/FBoolArray.hpp"
 #include "../Containers/FOctree.hpp"
-#include "../Containers/FBufferVector.hpp"
 
 #include "../Utils/FMpi.hpp"
 
@@ -59,8 +58,8 @@ class FFmmAlgorithmThreadProc : protected FAssertable {
         MortonIndex max;
     };
     Interval*const intervals;
-    Interval*const intervalsPerLevel;
     Interval*const realIntervalsPerLevel;
+    Interval*const workingIntervalsPerLevel;
 
 
     static void mpiassert(const int test, const unsigned line, const char* const message = 0){
@@ -87,8 +86,8 @@ public:
             : app(inApp), tree(inTree) , kernels(0), numberOfLeafs(0),
             MaxThreads(omp_get_max_threads()), nbProcess(inApp.processCount()), idProcess(inApp.processId()),
             OctreeHeight(tree->getHeight()),intervals(new Interval[inApp.processCount()]),
-            intervalsPerLevel(new Interval[inApp.processCount() * tree->getHeight()]),
-            realIntervalsPerLevel(new Interval[inApp.processCount() * tree->getHeight()]){
+            realIntervalsPerLevel(new Interval[inApp.processCount() * tree->getHeight()]),
+            workingIntervalsPerLevel(new Interval[inApp.processCount() * tree->getHeight()]){
 
         fassert(tree, "tree cannot be null", __LINE__, __FILE__);
 
@@ -109,8 +108,8 @@ public:
         delete [] this->kernels;
 
         delete [] intervals;
-        delete [] intervalsPerLevel;
         delete [] realIntervalsPerLevel;
+        delete [] workingIntervalsPerLevel;
     }
 
     /**
@@ -137,27 +136,23 @@ public:
         for(int idxLevel = 0 ; idxLevel < OctreeHeight ; ++idxLevel){
             const int offset = idxLevel * nbProcess;
             for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-                intervalsPerLevel[offset + idxProc].max = intervals[idxProc].max >> (3 * (OctreeHeight - idxLevel - 1));
-                intervalsPerLevel[offset + idxProc].min = intervals[idxProc].min >> (3 * (OctreeHeight - idxLevel - 1));
+                realIntervalsPerLevel[offset + idxProc].max = intervals[idxProc].max >> (3 * (OctreeHeight - idxLevel - 1));
+                realIntervalsPerLevel[offset + idxProc].min = intervals[idxProc].min >> (3 * (OctreeHeight - idxLevel - 1));
             }
 
-            realIntervalsPerLevel[offset + 0] = intervalsPerLevel[offset + 0];
+            workingIntervalsPerLevel[offset + 0] = realIntervalsPerLevel[offset + 0];
             for(int idxProc = 1 ; idxProc < nbProcess ; ++idxProc){
-                realIntervalsPerLevel[offset + idxProc].min = FMath::Max( intervalsPerLevel[offset + idxProc].min,
-                                                                          intervalsPerLevel[offset + idxProc - 1].max + 1);
-                realIntervalsPerLevel[offset + idxProc].max = intervalsPerLevel[offset + idxProc].max;
+                workingIntervalsPerLevel[offset + idxProc].min = FMath::Max( realIntervalsPerLevel[offset + idxProc].min,
+                                                                          realIntervalsPerLevel[offset + idxProc - 1].max + 1);
+                workingIntervalsPerLevel[offset + idxProc].max = realIntervalsPerLevel[offset + idxProc].max;
             }
         }
 
         // run;
-        preP2P();
-
         bottomPass();
 
         upwardPass();
 
-        return;
-
         downardPass();
 
         directPass();
@@ -169,149 +164,6 @@ public:
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
-    void preP2P(){
-        // Copy leafs
-        {
-            typename OctreeClass::Iterator octreeIterator(tree);
-            octreeIterator.gotoBottomLeft();
-            int idxLeaf = 0;
-            do{
-                this->iterArray[idxLeaf++] = octreeIterator;
-            } while(octreeIterator.moveRight());
-        }
-
-        printf("Prepare P2P\n");
-
-        // Box limite
-        const long limite = 1 << (this->OctreeHeight - 1);
-        // pointer to send
-        typename OctreeClass::Iterator* toSend[nbProcess];
-        memset(toSend, 0, sizeof(typename OctreeClass::Iterator*) * nbProcess );
-        int sizeToSend[nbProcess];
-        memset(sizeToSend, 0, sizeof(int) * nbProcess);
-        // index
-        int indexToSend[nbProcess];
-        memset(indexToSend, 0, sizeof(int) * nbProcess);
-
-        // To know if a leaf has been already sent to a proc
-        bool alreadySent[nbProcess];
-
-        for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
-            FTreeCoordinate center;
-            center.setPositionFromMorton(iterArray[idxLeaf].getCurrentGlobalIndex(), OctreeHeight - 1);
-
-            memset(alreadySent, false, sizeof(bool) * nbProcess);
-
-            // We test all cells around
-            for(long idxX = -1 ; idxX <= 1 ; ++idxX){
-                if(!FMath::Between(center.getX() + idxX,0l,limite)) continue;
-
-                for(long idxY = -1 ; idxY <= 1 ; ++idxY){
-                    if(!FMath::Between(center.getY() + idxY,0l,limite)) continue;
-
-                    for(long idxZ = -1 ; idxZ <= 1 ; ++idxZ){
-                        if(!FMath::Between(center.getZ() + idxZ,0l,limite)) continue;
-
-                        // if we are not on the current cell
-                        if( !(!idxX && !idxY && !idxZ) ){
-                            const FTreeCoordinate other(center.getX() + idxX,center.getY() + idxY,center.getZ() + idxZ);
-                            const MortonIndex mortonOther = other.getMortonIndex(this->OctreeHeight - 1);
-
-                            if(mortonOther < intervals[idProcess].min || intervals[idProcess].max < mortonOther){
-                                // find the proc that need this information
-                                int procToReceive = idProcess;
-                                while( procToReceive != 0 && mortonOther < intervals[procToReceive].min){
-                                    --procToReceive;
-                                }
-                                while( procToReceive != nbProcess - 1 && intervals[procToReceive].max < mortonOther){
-                                    ++procToReceive;
-                                }
-
-                                if( !alreadySent[procToReceive] && intervals[procToReceive].min <= mortonOther && mortonOther <= intervals[procToReceive].max){
-                                    alreadySent[procToReceive] = true;
-                                    if(indexToSend[procToReceive] ==  sizeToSend[procToReceive]){
-                                        const int previousSize = sizeToSend[procToReceive];
-                                        sizeToSend[procToReceive] = FMath::Max(50, int(sizeToSend[procToReceive] * 1.5));
-                                        typename OctreeClass::Iterator* temp = toSend[procToReceive];
-                                        toSend[procToReceive] = reinterpret_cast<typename OctreeClass::Iterator*>(new char[sizeof(typename OctreeClass::Iterator) * sizeToSend[procToReceive]]);
-                                        memcpy(toSend[procToReceive], temp, previousSize * sizeof(typename OctreeClass::Iterator));
-                                        delete[] reinterpret_cast<char*>(temp);
-                                    }
-                                    toSend[procToReceive][indexToSend[procToReceive]++] = iterArray[idxLeaf];
-                                }
-                            }
-                        }
-                    }
-                }
-            }
-        }
-
-        int globalReceiveMap[nbProcess * nbProcess];
-        memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess);
-
-        mpiassert( MPI_Allgather( indexToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, MPI_COMM_WORLD),  __LINE__ );
-
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-            printf("indexToSend[%d] = %d\n", idxProc, indexToSend[idxProc]);
-        }
-
-        printf("Will send ...\n");
-
-        // To send in asynchrone way
-        MPI_Request requests[2 * nbProcess];
-        int iterRequest = 0;
-
-        ParticleClass* sendBuffer[nbProcess];
-        memset(sendBuffer, 0, sizeof(ParticleClass*) * nbProcess);        
-
-        ParticleClass* recvBuffer[nbProcess];
-        memset(recvBuffer, 0, sizeof(ParticleClass*) * nbProcess);
-
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-            if(indexToSend[idxProc] != 0){
-                printf("Send %d to %d\n", indexToSend[idxProc], idxProc);
-                sendBuffer[idxProc] = reinterpret_cast<ParticleClass*>(new char[sizeof(ParticleClass) * indexToSend[idxProc]]);
-
-                int currentIndex = 0;
-                for(int idxLeaf = idxProc ; idxLeaf < indexToSend[idxProc] ; ++idxLeaf){
-                    memcpy(&sendBuffer[idxProc][currentIndex], toSend[idxProc][idxLeaf].getCurrentListSrc()->data(),
-                           sizeof(ParticleClass) * toSend[idxProc][idxLeaf].getCurrentListSrc()->getSize() );
-                    currentIndex += toSend[idxProc][idxLeaf].getCurrentListSrc()->getSize();
-                }
-
-                mpiassert( MPI_Isend( sendBuffer[idxProc], sizeof(ParticleClass) * indexToSend[idxProc] , MPI_BYTE ,
-                                     idxProc, TAG_P2P_PART, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
-
-            }
-            if(globalReceiveMap[idxProc * nbProcess + idProcess]){
-                printf("Receive %d from %d\n", globalReceiveMap[idxProc * nbProcess + idProcess], idxProc);
-                recvBuffer[idxProc] = reinterpret_cast<ParticleClass*>(new char[sizeof(ParticleClass) * globalReceiveMap[idxProc * nbProcess + idProcess]]);
-
-                mpiassert( MPI_Irecv(recvBuffer[idxProc], globalReceiveMap[idxProc * nbProcess + idProcess]*sizeof(ParticleClass), MPI_BYTE,
-                                    idxProc, TAG_P2P_PART, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
-            }
-        }
-
-        printf("Wait ...\n");
-        MPI_Waitall(iterRequest, requests, 0);
-
-        printf("Put data in the tree\n");
-        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-            for(int idxPart = 0 ; idxPart < globalReceiveMap[idxProc * nbProcess + idProcess] ; ++idxPart){
-                otherP2Ptree.insert(recvBuffer[idxProc][idxPart]);
-            }
-        }
-
-
-        printf("Delete array\n");
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-            delete [] reinterpret_cast<char*>(sendBuffer[idxProc]);
-            delete [] reinterpret_cast<char*>(recvBuffer[idxProc]);
-            delete [] reinterpret_cast<char*>(toSend[idxProc]);
-        }
-        printf("p2p is finished\n");
-    }
 
     /////////////////////////////////////////////////////////////////////////////
     // Utils functions
@@ -397,28 +249,22 @@ public:
         octreeIterator.moveUp();
         typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
 
-        int previousLeftProc = idProcess;
-        bool getWork = true;
+        int sendToProc = idProcess;
 
-        MPI_Status status[nbProcess];
-        MPI_Request requests[nbProcess];
-        int received[nbProcess];
-        memset(received, 0, sizeof(int) * nbProcess);
+        MPI_Request requests[14];
 
-        char* buffer[nbProcess];
-        memset(buffer, 0, sizeof(char*) * nbProcess);
+        KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
 
-        int sizeBuffer[nbProcess];
-        memset(sizeBuffer, 0, sizeof(int) * nbProcess);
+        const int recvBufferOffset = 8 * sizeof(CellClass) + 1;
+        char sendBuffer[recvBufferOffset];
+        char recvBuffer[nbProcess * recvBufferOffset];
 
         // for each levels
-        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 && getWork ; --idxLevel ){
+        for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
             // We do not touche cells that we are not responsible
-            while( octreeIterator.getCurrentGlobalIndex() < (realIntervalsPerLevel[(idxLevel + 1) * nbProcess + idProcess].min >>3) && (getWork = octreeIterator.moveRight()) ){}
-            if(!getWork) continue;
             printf("at level %d I go from %lld to %lld and the previous level >> 3 min %lld\n",
-                   idxLevel,realIntervalsPerLevel[idxLevel * nbProcess + idProcess].min,
-                   realIntervalsPerLevel[idxLevel * nbProcess + idProcess].max ,realIntervalsPerLevel[(idxLevel + 1) * nbProcess + idProcess].min >>3);
+                   idxLevel,workingIntervalsPerLevel[idxLevel * nbProcess + idProcess].min,
+                   workingIntervalsPerLevel[idxLevel * nbProcess + idProcess].max ,workingIntervalsPerLevel[(idxLevel + 1) * nbProcess + idProcess].min >>3);
 
             // copy cells to work with
             int numberOfCells = 0;
@@ -429,108 +275,97 @@ public:
             avoidGotoLeftIterator.moveUp();
             octreeIterator = avoidGotoLeftIterator;
 
-            int needToSend = 0;
             int iterRequests = 0;
+            bool cellsToSend = false;
 
             // find cells to send
-            if(idProcess != 0){
-                while( needToSend < numberOfCells && iterArray[needToSend].getCurrentGlobalIndex() < realIntervalsPerLevel[idxLevel * nbProcess + idProcess].min ){
-                    ++needToSend;
-                }
-                if(needToSend){
-                    // find the proc to send the data to
-                    while(previousLeftProc != 0 && this->intervals[idxLevel * nbProcess + previousLeftProc - 1].max <= iterArray[needToSend].getCurrentGlobalIndex() ){
-                        --previousLeftProc;
-                    }
-
-                    if(sizeBuffer[previousLeftProc] < int((sizeof(CellClass) * 8 + sizeof(MortonIndex) + 1) * needToSend) ){
-                        sizeBuffer[previousLeftProc] = (sizeof(CellClass) * 8 + sizeof(MortonIndex) + 1) * needToSend;
-                        delete[] buffer[previousLeftProc];
-                        buffer[previousLeftProc] = new char[ sizeBuffer[previousLeftProc] ];
-                    }
-
-                    int cellStartIndex = 0;
-
-                    for(int idxCell = 0 ; idxCell < needToSend ; ++idxCell){
-                        int cellIndex = cellStartIndex + 1;
-
-                        const MortonIndex currentIndex = iterArray[idxCell].getCurrentGlobalIndex();
-                        memcpy(&buffer[previousLeftProc][cellIndex], &currentIndex , sizeof(MortonIndex));
-                        cellIndex += sizeof(MortonIndex);
-
-                        CellClass** const child = iterArray[idxCell].getCurrentChild();
-                        char state = 0x0;
-                        for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
-                            if(child[idxChild]){
-                                state |= (1 << idxChild);
-                                memcpy(&buffer[previousLeftProc][cellIndex], child[idxChild], sizeof(CellClass));
-                                cellIndex += sizeof(CellClass);
-                            }
-                        }
-                        buffer[previousLeftProc][cellStartIndex] = state;
-                        cellStartIndex = cellIndex;
+            if(idProcess != 0
+                    && iterArray[0].getCurrentGlobalIndex() < workingIntervalsPerLevel[idxLevel * nbProcess + idProcess].min){
+                cellsToSend = true;
+                char state = 0;
+                int idxBuff = 1;
+
+                const CellClass* const* const child = iterArray[0].getCurrentChild();
+                for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+                    if( child[idxChild] ){
+                        memcpy(&sendBuffer[idxBuff], child[idxChild], sizeof(CellClass));
+                        idxBuff += sizeof(CellClass);
+                        state |= (0x1 << idxChild);
                     }
+                }
+                sendBuffer[0] = state;
 
-                    printf("I send %d bytes to proc id %d \n", cellStartIndex, previousLeftProc);
-                    MPI_Isend(buffer[previousLeftProc], cellStartIndex, MPI_BYTE, previousLeftProc, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
+                while( sendToProc && iterArray[0].getCurrentGlobalIndex() < workingIntervalsPerLevel[idxLevel * nbProcess + sendToProc].min){
+                    --sendToProc;
                 }
+
+                printf("I send %d bytes to proc id %d \n", idxBuff, sendToProc);
+                MPI_Isend(sendBuffer, idxBuff, MPI_BYTE, sendToProc, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
             }
 
 
+            bool hasToReceive = false;
             int firstProcThatSend = idProcess + 1;
             int endProcThatSend = firstProcThatSend;
-            MortonIndex indexUntilToRecv = realIntervalsPerLevel[idxLevel * nbProcess + idProcess].max;
 
             if(idProcess != nbProcess - 1){
                 while(firstProcThatSend < nbProcess &&
-                      (intervalsPerLevel[(idxLevel+1) * nbProcess + firstProcThatSend].max >> 3) <= realIntervalsPerLevel[idxLevel * nbProcess + idProcess].max ){
+                      workingIntervalsPerLevel[idxLevel * nbProcess + firstProcThatSend].max < workingIntervalsPerLevel[idxLevel * nbProcess + firstProcThatSend].min ){
                     ++firstProcThatSend;
                 }
                 endProcThatSend = firstProcThatSend;
                 while( endProcThatSend < nbProcess &&
-                        (realIntervalsPerLevel[(idxLevel+1) * nbProcess + endProcThatSend].min >> 3) <= intervalsPerLevel[idxLevel * nbProcess + idProcess].max){
+                        realIntervalsPerLevel[idxLevel * nbProcess + firstProcThatSend].min == workingIntervalsPerLevel[idxLevel * nbProcess + idProcess].max){
                     ++endProcThatSend;
                 }
                 if(firstProcThatSend != endProcThatSend){
-                    indexUntilToRecv = realIntervalsPerLevel[idxLevel * nbProcess + firstProcThatSend].max;
+                    hasToReceive = true;
 
                     for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){
-
-                        const int maxToReceive = FMath::Min(realIntervalsPerLevel[idxLevel * nbProcess + idProcess].max,realIntervalsPerLevel[(idxLevel+1) * nbProcess + idxProc].max >> 3)
-                                                - (realIntervalsPerLevel[(idxLevel+1) * nbProcess + idxProc].min >> 3) + 1;
-
-                        if(sizeBuffer[idxProc] < int((sizeof(CellClass) * 8 + sizeof(MortonIndex) + 1) * maxToReceive) ){
-                            sizeBuffer[idxProc] = (sizeof(CellClass) * 8 + sizeof(MortonIndex) + 1) * maxToReceive;
-                            delete[] buffer[idxProc];
-                            buffer[idxProc] = new char[ sizeBuffer[idxProc] ];
-                        }
-
-                        printf("I irecv max %d bytes from proc id %d \n", sizeBuffer[idxProc], idxProc);
-                        MPI_Irecv(buffer[idxProc], sizeBuffer[idxProc], MPI_BYTE, idxProc, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
+                        printf("I irecv from proc id %d \n", idxProc);
+                        MPI_Irecv(&recvBuffer[idxProc * recvBufferOffset], recvBufferOffset, MPI_BYTE, idxProc, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
                     }
                 }
             }
 
-            printf("level %d I need to send %d, need to receive from %d procs\n",idxLevel, needToSend, endProcThatSend - firstProcThatSend);
-
-            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
-            int idxCell = needToSend;
-            for( ; idxCell < numberOfCells && iterArray[idxCell].getCurrentGlobalIndex() <= indexUntilToRecv ; ++idxCell){
+            for( int idxCell = cellsToSend ; idxCell < (hasToReceive?numberOfCells-1:numberOfCells) ; ++idxCell){
                 myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
             }
 
-            if(iterRequests) MPI_Waitall( iterRequests, requests, status);
+            if(iterRequests){
+                MPI_Waitall( iterRequests, requests, 0);
+
+                if( hasToReceive ){
+                    CellClass* currentChild[8];
+                    memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*));
+
+                    for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){
+                        char state = recvBuffer[idxProc * recvBufferOffset];
+
+                        int position = 0;
+                        int bufferIndex = 1;
+                        while( state && position < 8){
+                            while(!(state & 0x1)){
+                                state >>= 1;
+                                ++position;
+                            }
+
+                            fassert(!currentChild[position], "Already has a cell here", __LINE__, __FILE__);
+                            currentChild[position] = (CellClass*) &recvBuffer[idxProc * recvBufferOffset + bufferIndex];
+                            bufferIndex += sizeof(CellClass);
+
+                            state >>= 1;
+                            ++position;
+                        }
+                    }
 
-            int statusIter = (needToSend ? 1 : 0);
-            for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){
-                MPI_Get_count( &status[statusIter++], MPI_BYTE, &received[idxProc]);
-                printf("Has received %d from %d\n", received[idxProc], idxProc);
+                    printf("Before M2M\n");
+                    myThreadkernels.M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel);
+                    printf("After M2M\n");
+                }
             }
         }
 
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc ){
-            delete[] buffer[idxProc];
-        }
 
         FDEBUG( FDebug::Controller << "\tFinished (@Upward Pass (M2M) = "  << counterTime.tacAndElapsed() << "s)\n" );
         FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
@@ -558,6 +393,35 @@ public:
             FDEBUG(FTic findCounter);
 
 
+            typename OctreeClass::Iterator octreeIterator(tree);
+            octreeIterator.moveDown();
+            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+
+            // for each level
+            for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
+                int numberOfCells = 0;
+                // for each cell
+                do{
+                    iterArray[numberOfCells] = octreeIterator;
+                    ++numberOfCells;
+                } while(octreeIterator.moveRight());
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
+
+                FDEBUG(computationCounter.tic());
+                #pragma omp parallel
+                {
+                    KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
+                    const CellClass* neighbors[208];
+
+                    #pragma omp for  schedule(dynamic) nowait
+                    for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
+                        const int counter = tree->getDistantNeighbors(neighbors,  iterArray[idxCell].getCurrentGlobalCoordinate(),idxLevel);
+                        if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
+                    }
+                }
+                FDEBUG(computationCounter.tac());
+            }
 
             FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (M2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
             FDEBUG( FDebug::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
@@ -576,6 +440,72 @@ public:
             FDEBUG(FTic sendCounter);
             FDEBUG(FTic receiveCounter);
 
+            // Start from leaf level - 1
+            typename OctreeClass::Iterator octreeIterator(tree);
+            octreeIterator.moveDown();
+            typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
+
+            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
+
+            MPI_Request requests[nbProcess];
+
+            const int heightMinusOne = OctreeHeight - 1;
+
+            // for each level except the leaf level
+            for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
+                // copy cells to work with
+                int numberOfCells = 0;
+                // for each cell
+                do{
+                    iterArray[numberOfCells++] = octreeIterator;
+                } while(octreeIterator.moveRight());
+                avoidGotoLeftIterator.moveDown();
+                octreeIterator = avoidGotoLeftIterator;
+
+
+                bool needToRecv = false;
+                int iterRequests = 0;
+
+                // do we need to receive one cell or zero cells
+                if(idProcess != 0
+                        && realIntervalsPerLevel[idxLevel * nbProcess + idProcess].min != workingIntervalsPerLevel[idxLevel * nbProcess + idProcess].min){
+                    needToRecv = true;
+                    printf("I recv one cell at index %lld\n", iterArray[0].getCurrentGlobalIndex() );
+                    MPI_Irecv( iterArray[0].getCurrentCell(), sizeof(CellClass), MPI_BYTE, MPI_ANY_SOURCE, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
+                }
+
+
+                if(idProcess != nbProcess - 1){
+                    int firstProcThatRecv = idProcess + 1;
+                    while( firstProcThatRecv < nbProcess &&
+                            workingIntervalsPerLevel[idxLevel * nbProcess + firstProcThatRecv].max < workingIntervalsPerLevel[idxLevel * nbProcess + firstProcThatRecv].min){
+                        ++firstProcThatRecv;
+                    }
+
+                    int endProcThatRecv = firstProcThatRecv;
+                    while( endProcThatRecv < nbProcess &&
+                            (realIntervalsPerLevel[(idxLevel+1) * nbProcess + endProcThatRecv].min >> 3) == iterArray[numberOfCells - 1].getCurrentGlobalIndex()){
+                        ++endProcThatRecv;
+                    }
+                    if(endProcThatRecv != idProcess + 1){
+                        for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){
+                            printf("I send 1 to proc %d at index %lld\n", idxProc, iterArray[numberOfCells - 1].getCurrentGlobalIndex());
+                            MPI_Isend(iterArray[numberOfCells - 1].getCurrentCell(), sizeof(CellClass), MPI_BYTE, idxProc, 0, MPI_COMM_WORLD, &requests[iterRequests++]);
+                        }
+                    }
+                }
+
+                for(int idxCell = (needToRecv ? 1 : 0) ; idxCell < numberOfCells ; ++idxCell){
+                    myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
+                }
+
+                if(iterRequests){
+                    MPI_Waitall( iterRequests, requests, 0);
+                    if(needToRecv){
+                        myThreadkernels.L2L( iterArray[0].getCurrentCell() , iterArray[0].getCurrentChild(), idxLevel);
+                    }
+                }
+            }
 
 
             FDEBUG( FDebug::Controller << "\tFinished (@Downward Pass (L2L) = "  << counterTime.tacAndElapsed() << "s)\n" );
@@ -597,6 +527,153 @@ public:
         FDEBUG( FDebug::Controller.write("\tStart Direct Pass\n").write(FDebug::Flush); );
         FDEBUG(FTic counterTime);
 
+        ///////////////////////////////////////////////////
+        // Precompute
+        ///////////////////////////////////////////////////
+        FBoolArray leafsNeedOther(this->numberOfLeafs);
+        OctreeClass otherP2Ptree( tree->getHeight(), tree->getSubHeight(), tree->getBoxWidth(), tree->getBoxCenter() );
+
+        {
+            // Copy leafs
+            {
+                typename OctreeClass::Iterator octreeIterator(tree);
+                octreeIterator.gotoBottomLeft();
+                int idxLeaf = 0;
+                do{
+                    this->iterArray[idxLeaf++] = octreeIterator;
+                } while(octreeIterator.moveRight());
+            }
+
+            printf("Prepare P2P\n");
+
+            // Box limit
+            const long limite = 1 << (this->OctreeHeight - 1);
+            // arrays of leaf iterators to send, one per proc
+            typename OctreeClass::Iterator* toSend[nbProcess];
+            memset(toSend, 0, sizeof(typename OctreeClass::Iterator*) * nbProcess );
+            int sizeToSend[nbProcess];
+            memset(sizeToSend, 0, sizeof(int) * nbProcess);
+            // number of leaves stored in each toSend array
+            int indexToSend[nbProcess];
+            memset(indexToSend, 0, sizeof(int) * nbProcess);
+
+            // To know if a leaf has already been sent to a proc
+            bool alreadySent[nbProcess];
+
+            MortonIndex indexesNeighbors[26];
+
+            for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
+                FTreeCoordinate center;
+                center.setPositionFromMorton(iterArray[idxLeaf].getCurrentGlobalIndex(), OctreeHeight - 1);
+
+                memset(alreadySent, false, sizeof(bool) * nbProcess);
+                bool needOther = false;
+
+                getNeighborsIndexes(iterArray[idxLeaf].getCurrentGlobalIndex(), limite, indexesNeighbors);
+
+                for(int idxNeigh = 0 ; idxNeigh < 26 ; ++idxNeigh){
+                    if(indexesNeighbors[idxNeigh] < intervals[idProcess].min || intervals[idProcess].max < indexesNeighbors[idxNeigh]){
+                        needOther = true;
+
+                        // find the proc that needs this information
+                        int procToReceive = idProcess;
+                        while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].min){
+                            --procToReceive;
+                        }
+                        while( procToReceive != nbProcess - 1 && intervals[procToReceive].max < indexesNeighbors[idxNeigh]){
+                            ++procToReceive;
+                        }
+
+                        if( !alreadySent[procToReceive] && intervals[procToReceive].min <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].max){
+                            alreadySent[procToReceive] = true;
+                            if(indexToSend[procToReceive] ==  sizeToSend[procToReceive]){
+                                const int previousSize = sizeToSend[procToReceive];
+                                sizeToSend[procToReceive] = FMath::Max(50, int(sizeToSend[procToReceive] * 1.5));
+                                typename OctreeClass::Iterator* temp = toSend[procToReceive];
+                                toSend[procToReceive] = reinterpret_cast<typename OctreeClass::Iterator*>(new char[sizeof(typename OctreeClass::Iterator) * sizeToSend[procToReceive]]);
+                                memcpy(toSend[procToReceive], temp, previousSize * sizeof(typename OctreeClass::Iterator));
+                                delete[] reinterpret_cast<char*>(temp);
+                            }
+                            toSend[procToReceive][indexToSend[procToReceive]++] = iterArray[idxLeaf];
+                        }
+                    }
+                }
+
+                if(needOther){
+                    leafsNeedOther.set(idxLeaf,true);
+                }
+            }
+
+            int globalReceiveMap[nbProcess * nbProcess];
+            memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess);
+
+            mpiassert( MPI_Allgather( indexToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, MPI_COMM_WORLD),  __LINE__ );
+
+            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                printf("indexToSend[%d] = %d\n", idxProc, indexToSend[idxProc]);
+            }
+
+            printf("Will send ...\n");
+
+            // To send in an asynchronous way
+            MPI_Request requests[2 * nbProcess];
+            int iterRequest = 0;
+
+            ParticleClass* sendBuffer[nbProcess];
+            memset(sendBuffer, 0, sizeof(ParticleClass*) * nbProcess);
+
+            ParticleClass* recvBuffer[nbProcess];
+            memset(recvBuffer, 0, sizeof(ParticleClass*) * nbProcess);
+
+            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                if(indexToSend[idxProc] != 0){
+                    printf("Send %d to %d\n", indexToSend[idxProc], idxProc);
+                    sendBuffer[idxProc] = reinterpret_cast<ParticleClass*>(new char[sizeof(ParticleClass) * indexToSend[idxProc]]);
+
+                    int currentIndex = 0;
+                    for(int idxLeaf = idxProc ; idxLeaf < indexToSend[idxProc] ; ++idxLeaf){
+                        memcpy(&sendBuffer[idxProc][currentIndex], toSend[idxProc][idxLeaf].getCurrentListSrc()->data(),
+                               sizeof(ParticleClass) * toSend[idxProc][idxLeaf].getCurrentListSrc()->getSize() );
+                        currentIndex += toSend[idxProc][idxLeaf].getCurrentListSrc()->getSize();
+                    }
+
+                    mpiassert( MPI_Isend( sendBuffer[idxProc], sizeof(ParticleClass) * indexToSend[idxProc] , MPI_BYTE ,
+                                         idxProc, TAG_P2P_PART, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
+
+                }
+                if(globalReceiveMap[idxProc * nbProcess + idProcess]){
+                    printf("Receive %d from %d\n", globalReceiveMap[idxProc * nbProcess + idProcess], idxProc);
+                    recvBuffer[idxProc] = reinterpret_cast<ParticleClass*>(new char[sizeof(ParticleClass) * globalReceiveMap[idxProc * nbProcess + idProcess]]);
+
+                    mpiassert( MPI_Irecv(recvBuffer[idxProc], globalReceiveMap[idxProc * nbProcess + idProcess]*sizeof(ParticleClass), MPI_BYTE,
+                                        idxProc, TAG_P2P_PART, MPI_COMM_WORLD, &requests[iterRequest++]) , __LINE__ );
+                }
+            }
+
+            printf("Wait ...\n");
+            MPI_Waitall(iterRequest, requests, 0);
+
+            printf("Put data in the tree\n");
+            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                for(int idxPart = 0 ; idxPart < globalReceiveMap[idxProc * nbProcess + idProcess] ; ++idxPart){
+                    otherP2Ptree.insert(recvBuffer[idxProc][idxPart]);
+                }
+            }
+
+
+            printf("Delete array\n");
+            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                delete [] reinterpret_cast<char*>(sendBuffer[idxProc]);
+                delete [] reinterpret_cast<char*>(recvBuffer[idxProc]);
+                delete [] reinterpret_cast<char*>(toSend[idxProc]);
+            }
+            printf("prep2p is finished\n");
+        }
+
+        ///////////////////////////////////////////////////
+        // Direct Computation
+        ///////////////////////////////////////////////////
+
         // init
         const int LeafIndex = OctreeHeight - 1;
         const int SizeShape = 3*3*3;
@@ -664,6 +741,8 @@ public:
             ContainerClass* neighbors[26];
             MortonIndex neighborsIndex[26];
             int previous = 0;
+            // Box limit
+            const long limite = 1 << (this->OctreeHeight - 1);
 
             for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                 const int endAtThisShape = shapeLeaf[idxShape] + previous;
@@ -672,9 +751,29 @@ public:
                 for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; ++idxLeafs){
                     LeafData& currentIter = leafsDataArray[idxLeafs];
                     myThreadkernels.L2P(currentIter.cell, currentIter.targets);
+
                     // need the current particles and neighbors particles
                     const int counter = tree->getLeafsNeighborsWithIndex(neighbors, neighborsIndex, currentIter.index,LeafIndex);
-                    myThreadkernels.P2P( currentIter.index,currentIter.targets, currentIter.sources , neighbors, neighborsIndex, counter);
+
+                    if(leafsNeedOther.get(idxLeafs)){
+                        MortonIndex indexesNeighbors[26];
+                        getNeighborsIndexes(currentIter.index, limite, indexesNeighbors);
+                        int counterAll = counter;
+                        for(int idxNeigh = 0 ; idxNeigh < 26 ; ++idxNeigh){
+                            if(indexesNeighbors[idxNeigh] < intervals[idProcess].min || intervals[idProcess].max < indexesNeighbors[idxNeigh]){
+                                ContainerClass*const hypotheticNeighbor = otherP2Ptree.getLeafSrc(indexesNeighbors[idxNeigh]);
+                                if(hypotheticNeighbor){
+                                    neighbors[counterAll] = hypotheticNeighbor;
+                                    neighborsIndex[counterAll] = indexesNeighbors[idxNeigh];
+                                    ++counterAll;
+                                }
+                            }
+                        }
+                        myThreadkernels.P2P( currentIter.index,currentIter.targets, currentIter.sources , neighbors, neighborsIndex, counterAll);
+                    }
+                    else{
+                        myThreadkernels.P2P( currentIter.index,currentIter.targets, currentIter.sources , neighbors, neighborsIndex, counter);
+                    }
                 }
 
                 previous = endAtThisShape;
@@ -687,6 +786,30 @@ public:
         FTRACE( FTrace::Controller.leaveFunction(FTrace::FMM) );
     }
 
+
+    void getNeighborsIndexes(const MortonIndex centerIndex, const long limite, MortonIndex indexes[26]){
+        FTreeCoordinate center;
+        center.setPositionFromMorton(centerIndex, OctreeHeight - 1);
+        int idxNeig = 0;
+        // We test all cells around
+        for(long idxX = -1 ; idxX <= 1 ; ++idxX){
+            if(!FMath::Between(center.getX() + idxX,0l, limite)) continue;
+
+            for(long idxY = -1 ; idxY <= 1 ; ++idxY){
+                if(!FMath::Between(center.getY() + idxY,0l, limite)) continue;
+
+                for(long idxZ = -1 ; idxZ <= 1 ; ++idxZ){
+                    if(!FMath::Between(center.getZ() + idxZ,0l, limite)) continue;
+
+                    // if we are not on the current cell
+                    if( idxX || idxY || idxZ ){
+                        const FTreeCoordinate other(center.getX() + idxX,center.getY() + idxY,center.getZ() + idxZ);
+                        indexes[idxNeig++] = other.getMortonIndex(this->OctreeHeight - 1);
+                    }
+                }
+            }
+        }
+    }
 };