diff --git a/Src/GroupTree/Core/FGroupOfParticles.hpp b/Src/GroupTree/Core/FGroupOfParticles.hpp
index 93478fc5ab679e466e6edfd4e8b8911a21631709..4225095238331ac3e9266d3aea9632163c061ff8 100644
--- a/Src/GroupTree/Core/FGroupOfParticles.hpp
+++ b/Src/GroupTree/Core/FGroupOfParticles.hpp
@@ -68,8 +68,6 @@ protected:
     BlockHeader*    blockHeader;
     //< Pointer to leaves information
     LeafHeader*     leafHeader;
-    //< The total number of particles in the group
-    const FSize nbParticlesInGroup;
 
     //< Pointers to particle position x, y, z
     FReal* particlePosition[3];
@@ -92,7 +90,7 @@ public:
     FGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
                       unsigned char* inAttributes)
         : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
-          blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
+          blockHeader(nullptr), leafHeader(nullptr),
           attributesBuffer(nullptr), deleteBuffer(false){
         // Move the pointers to the correct position
         blockHeader         = reinterpret_cast<BlockHeader*>(inBuffer);
@@ -127,12 +125,12 @@ public:
  * @param inNumberOfLeaves total number of leaves in the interval (should be <= inEndingIndex-inEndingIndex)
  */
     FGroupOfParticles(const MortonIndex inStartingIndex, const MortonIndex inEndingIndex, const int inNumberOfLeaves, const FSize inNbParticles)
-        : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(inNbParticles),
+        : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr),
           deleteBuffer(true){
         memset(particlePosition, 0, sizeof(particlePosition));
         memset(particleAttributes, 0, sizeof(particleAttributes));
 
-        const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(nbParticlesInGroup+(MemoryAlignementParticles-1)*inNumberOfLeaves);
+        const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(inNbParticles+(MemoryAlignementParticles-1)*inNumberOfLeaves);
 
         // Find the number of leaf to allocate in the blocks
         FAssertLF((inEndingIndex-inStartingIndex) >= MortonIndex(inNumberOfLeaves));
@@ -161,6 +159,7 @@ public:
         blockHeader->endingIndex   = inEndingIndex;
         blockHeader->numberOfLeavesInBlock  = inNumberOfLeaves;
         blockHeader->nbParticlesAllocatedInGroup = nbParticlesAllocatedInGroup;
+        blockHeader->nbParticlesInGroup = inNbParticles;
 
         // Init particle pointers
         blockHeader->positionsLeadingDim = (sizeof(FReal) * nbParticlesAllocatedInGroup);
@@ -247,7 +246,7 @@ public:
 
     /** Get the total number of particles in the group */
     FSize getNbParticlesInGroup() const {
-        return nbParticlesInGroup;
+        return blockHeader->nbParticlesInGroup;
     }
 
     /** The size of the interval endingIndex-startingIndex (set from the constructor) */
diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
index eba5a5cd192f366a62416b36e8132d0252d433fc..562e3789b23b31ec0cca5ed1df13a9e21d380293 100644
--- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
@@ -250,6 +250,27 @@ public:
 #endif
     }
 
+    void syncData(){
+        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
+            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
+            }
+        }
+        {
+            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
+                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
+                starpu_data_release(particleHandles[idxHandle].symb);
+                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
+                starpu_data_release(particleHandles[idxHandle].down);
+            }
+        }
+    }
+
     ~FGroupTaskStarPUAlgorithm(){
         starpu_resume();
 
@@ -338,6 +359,11 @@ protected:
 
         FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
         starpu_task_wait_for_all();
+
+        FLOG( FTic timerSync; );
+        syncData();
+        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
+
         starpu_pause();
 
 #ifdef STARPU_USE_CPU
diff --git a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
index fb44ebdc6d04a9c15e2da2b631fdc8563613803d..2dc3e070d1c71216829fbb57a04a5887e4762eec 100644
--- a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
@@ -265,6 +265,27 @@ public:
 #endif
     }
 
+    void syncData(){
+        for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){
+            for(int idxHandle = 0 ; idxHandle < int(cellHandles[idxLevel].size()) ; ++idxHandle){
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].symb, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].symb);
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].up, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].up);
+                starpu_data_acquire(cellHandles[idxLevel][idxHandle].down, STARPU_R);
+                starpu_data_release(cellHandles[idxLevel][idxHandle].down);
+            }
+        }
+        {
+            for(int idxHandle = 0 ; idxHandle < int(particleHandles.size()) ; ++idxHandle){
+                starpu_data_acquire(particleHandles[idxHandle].symb, STARPU_R);
+                starpu_data_release(particleHandles[idxHandle].symb);
+                starpu_data_acquire(particleHandles[idxHandle].down, STARPU_R);
+                starpu_data_release(particleHandles[idxHandle].down);
+            }
+        }
+    }
+
     ~FGroupTaskStarPUMpiAlgorithm(){
         starpu_resume();
 
@@ -324,6 +345,7 @@ public:
     }
 
 protected:
+
     /**
       * Runs the complete algorithm.
       */
@@ -362,6 +384,11 @@ protected:
 #endif
 
         starpu_task_wait_for_all();
+
+        FLOG( FTic timerSync; );
+        syncData();
+        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
+
         starpu_pause();
 
 #ifdef STARPU_USE_CPU
diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
index cd1980ca2599ea422155b335674bc33e093c652b..4d8ff95c103fb033fc7a943bc2a3a84d85cbdfe6 100644
--- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
+++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
@@ -31,12 +31,17 @@ __global__ void FCuda__bottomPassPerform(unsigned char* leafCellsPtr, std::size_
                                          CudaKernelClass* kernel){
     CellContainerClass leafCells(leafCellsPtr, leafCellsSize, leafCellsUpPtr, nullptr);
     ParticleContainerGroupClass containers(containersPtr, containersSize, nullptr);
+    printf("containers.getNbParticlesInGroup() %ld \n", containers.getNbParticlesInGroup());
+    printf("containers.getSizeOfInterval() %ld \n", containers.getSizeOfInterval());
+    printf("containers.getStartingIndex() %lld \n", containers.getStartingIndex());
+    printf("containers.getEndingIndex() %lld \n", containers.getEndingIndex());
 
     for(int leafIdx = blockIdx.x ; leafIdx < leafCells.getNumberOfCellsInBlock() ; leafIdx += gridDim.x){
         typename CellContainerClass::CompleteCellClass cell = leafCells.getUpCell(leafIdx);
         ParticleGroupClass particles = containers.template getLeaf<ParticleGroupClass>(leafIdx);
         FCudaAssertLF(leafCells.getCellMortonIndex(leafIdx) == containers.getLeafMortonIndex(leafIdx));
         kernel->P2M(cell, &particles);
+        printf("particles.getNbParticles() %lld \n", particles.getNbParticles());
     }
 }
 
@@ -65,70 +70,48 @@ __host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-                                         FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-                                         FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-                                         int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){
+                                         unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+                                         int idxLevel, CudaKernelClass* kernel){
     CellContainerClass currentCells(currentCellsPtr, currentCellsSize,currentCellsUpPtr,nullptr);
-    CellContainerClass subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], subCellGroupsUpPtr.values[idx], nullptr);
-    }
+    CellContainerClass subCellGroup(childCellsPtr, childCellsSize,childCellsUpPtr,nullptr);
 
-    const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
-    const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
+    const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3);
+    const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3);
 
-    if(firstCell == currentCells.getNumberOfCellsInBlock()){
-        return ;
-    }
+    int idxParentCell = currentCells.getCellIndex(firstParent);
+    int idxChildCell = subCellGroup.getFistChildIdx(firstParent);
 
-    FCudaAssertLF(nbSubCellGroups != 0);
-    int idxSubCellGroup = 0;
-    int idxChildCell = 0;
-    {// Find first child
-        const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell);
-        while(idxSubCellGroup != nbSubCellGroups
-              && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){
-            idxSubCellGroup += 1;
-        }
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
-        idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0));
-    }
-    FCudaAssertLF(idxChildCell != -1);
-
-    for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){
-        typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(cellIdx);
-        FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx));
+    while(true){
+        typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(idxParentCell);
         typename CellContainerClass::CompleteCellClass child[8];
 
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
 
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = nullptr;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7);
-            FCudaAssertLF(child[idxChild].symb == nullptr);
-            child[idxChild] = subCellGroups[idxSubCellGroup].getUpCell(idxChildCell);
+        do{
+            const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7);
+            child[idxChild] = subCellGroup.getUpCell(idxChildCell);
 
             idxChildCell += 1;
-            if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
-        }
+        }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3));
 
         kernel->M2M(cell, child, idxLevel);
+
+        if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){
+            break;
+        }
+
+        idxParentCell += 1;
     }
 }
 
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-                                        FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-                                        FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-                                        int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
+                                        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+                                        int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize){
 
     FCuda__upwardPassPerform
@@ -136,8 +119,8 @@ __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::siz
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>
                             (currentCellsPtr, currentCellsSize,currentCellsUpPtr,
-                             subCellGroupsPtr, subCellGroupsSize,subCellGroupsUpPtr,
-                             nbSubCellGroups, idxLevel, kernel);
+                             childCellsPtr, childCellsSize,childCellsUpPtr,
+                             idxLevel, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
 }
@@ -272,17 +255,17 @@ __host__ void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std:
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-                                                unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+                                                unsigned char* currentCellsDownPtr,
                                                 unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-                                                unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-                                                int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+                                                unsigned char* externalCellsUpPtr,
+                                                int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
                                                 int nbOutsideInteractions, CudaKernelClass* kernel){
     if(blockIdx.x != 0){
         return;
     }
 
-    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
-    CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr);
+    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
+    CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr);
 
     for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
         const int cellPos = cellsOther.getCellIndex(outsideInteractions[outInterIdx].outIndex);
@@ -310,10 +293,10 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr,
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-                                               unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+                                               unsigned char* currentCellsDownPtr,
                                                unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-                                               unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-                                               int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+                                               unsigned char* externalCellsUpPtr,
+                                               int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
                                                int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize){
     OutOfBlockInteraction* cuOutsideInteractions;
@@ -326,10 +309,10 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
             <SymboleCellClass, PoleCellClass, LocalCellClass,
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>(currentCellsPtr, currentCellsSize,
-                                                                currentCellsUpPtr, currentCellsDownPtr,
+                                                                currentCellsDownPtr,
                                                                 externalCellsPtr, externalCellsSize,
-                                                                externalCellsUpPtr, externalCellsDownPtr,
-                                                                idxLevel, cuOutsideInteractions,
+                                                                externalCellsUpPtr,
+                                                                idxLevel, mode, cuOutsideInteractions,
                                                                 nbOutsideInteractions, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
@@ -345,77 +328,56 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-                                          FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-                                          FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-                                          int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){
-    FCudaAssertLF(nbSubCellGroups != 0);
-    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
-    CellContainerClass subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], nullptr, subCellGroupsDownPtr.values[idx]);
-    }
+                                          unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+                                          int idxLevel, CudaKernelClass* kernel){
+    CellContainerClass currentCells(currentCellsPtr, currentCellsSize,nullptr,currentCellsDownPtr);
+    CellContainerClass subCellGroup(childCellsPtr, childCellsSize,nullptr,childCellsDownPtr);
 
-    const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
-    const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
+    const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3);
+    const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3);
 
-    if(firstCell == currentCells.getNumberOfCellsInBlock()){
-        return ;
-    }
+    int idxParentCell = currentCells.getCellIndex(firstParent);
+    int idxChildCell = subCellGroup.getFistChildIdx(firstParent);
 
-    FCudaAssertLF(nbSubCellGroups != 0);
-    int idxSubCellGroup = 0;
-    int idxChildCell = 0;
-    {// Find first child
-        const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell);
-        while(idxSubCellGroup != nbSubCellGroups
-              && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){
-            idxSubCellGroup += 1;
-        }
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
-        idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0));
-    }
-    FCudaAssertLF(idxChildCell != -1);
-
-    for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){
-        typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx);
-        FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx));
+    while(true){
+        typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(idxParentCell);
         typename CellContainerClass::CompleteCellClass child[8];
 
+
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = nullptr;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7);
-            FCudaAssertLF(child[idxChild].symb == nullptr);
-            child[idxChild] = subCellGroups[idxSubCellGroup].getDownCell(idxChildCell);
+        do{
+            const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7);
+            child[idxChild] = subCellGroup.getDownCell(idxChildCell);
 
             idxChildCell += 1;
-            if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
-        }
+        }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3));
 
         kernel->L2L(cell, child, idxLevel);
+
+        if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){
+            break;
+        }
+
+        idxParentCell += 1;
     }
 }
 
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-                                         FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-                                         FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-                                         int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
+                                        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+                                         int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize){
 
     FCuda__downardPassPerform
             <SymboleCellClass, PoleCellClass, LocalCellClass,
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>
-            (currentCellsPtr, currentCellsSize, currentCellsDownPtr, subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr,
-             nbSubCellGroups, idxLevel, kernel);
+            (currentCellsPtr, currentCellsSize, currentCellsDownPtr, childCellsPtr, childCellsSize, childCellsDownPtr,
+             idxLevel, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
 }
@@ -682,9 +644,8 @@ template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroup
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -705,19 +666,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -770,9 +730,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -793,19 +752,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData,
 template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -856,9 +814,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -879,19 +836,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData,
 template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+    int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -945,9 +901,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+    int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -968,19 +923,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -1031,9 +985,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -1054,19 +1007,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
index 5559c908570cc75388ecfa1c2018ab4e98d53c59..4ba6615534e597cca892fbd02ce44a5c93a6e8cf 100644
--- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
+++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
@@ -17,9 +17,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__upwardPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
+        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+     int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
@@ -43,10 +42,10 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__transferInoutPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+    unsigned char* currentCellsUpPtr,
     unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+    unsigned char* externalCellsDownPtr,
+    int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
     int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
@@ -54,9 +53,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__downardPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
+        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+        int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
diff --git a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
index d7d8aff5cea8627d3ad487b3e1286fc1af7bed2b..a119cc22ed8b35c53fc0fc835a621fb1ce15e95f 100644
--- a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
+++ b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
@@ -59,8 +59,6 @@ protected:
     BlockHeader*    blockHeader;
     //< Pointer to leaves information
     LeafHeader*     leafHeader;
-    //< The total number of particles in the group
-    const FSize nbParticlesInGroup;
 
     //< Pointers to particle position x, y, z
     FReal* particlePosition[3];
@@ -78,11 +76,12 @@ public:
     __device__ FCudaGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
                                      unsigned char* inAttributes)
         : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
-          blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
+          blockHeader(nullptr), leafHeader(nullptr),
           attributesBuffer(nullptr){
         // Move the pointers to the correct position
-        blockHeader         = reinterpret_cast<BlockHeader*>(memoryBuffer);
-        leafHeader          = reinterpret_cast<LeafHeader*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->numberOfLeavesInBlock*sizeof(int)));
+        blockHeader         = reinterpret_cast<BlockHeader*>(inBuffer);
+        inBuffer += sizeof(BlockHeader);
+        leafHeader          = reinterpret_cast<LeafHeader*>(inBuffer);
 
         // Init particle pointers
         // Assert blockHeader->positionsLeadingDim == (sizeof(FReal) * blockHeader->nbParticlesAllocatedInGroup);
@@ -122,7 +121,7 @@ public:
 
     /** Get the total number of particles in the group */
     __device__ FSize getNbParticlesInGroup() const {
-        return nbParticlesInGroup;
+        return blockHeader->nbParticlesInGroup;
     }
 
     /** The size of the interval endingIndex-startingIndex (set from the constructor) */
diff --git a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
index b0b2679d12ec8c5b8feda2a6cdc8f2ad4c7580ae..5e438872e83e205ad5fdc3638c822eae068d6c2f 100644
--- a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
+++ b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
@@ -96,21 +96,9 @@ public:
         FStarPUPtrInterface* worker = nullptr;
         int nbSubCellGroups = 0;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        FCudaParams<unsigned char*,9> subCellGroupsPtr;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
-        FCudaParams<std::size_t,9> subCellGroupsSize;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
-        FCudaParams<unsigned char*,9> subCellGroupsUpPtr;
-        memset(&subCellGroupsUpPtr, 0, sizeof(subCellGroupsUpPtr));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]);
-            subCellGroupsUpPtr.values[idxSubGroup] = (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]);
-        }
-
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
         FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
@@ -118,8 +106,10 @@ public:
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                 STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                 (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
-                subCellGroupsPtr,subCellGroupsSize,subCellGroupsUpPtr,
-                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
+                idxLevel, kernel, starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
 
@@ -130,8 +120,8 @@ public:
     static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        const std::vector<OutOfBlockInteraction>* outsideInteractions;
-        int intervalSize;
+        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -156,7 +146,7 @@ public:
     static void transferInPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -174,9 +164,10 @@ public:
     static void transferInoutPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        const std::vector<OutOfBlockInteraction>* outsideInteractions;
-        int intervalSize;
-        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
+        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
+        int intervalSize = 0;
+        int mode = 0;
+        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
@@ -186,11 +177,9 @@ public:
                     STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
-                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]),
-                    (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[4]),
-                    (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[5]),
-                    idxLevel, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
+                    idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
                     starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
@@ -202,21 +191,9 @@ public:
         FStarPUPtrInterface* worker = nullptr;
         int nbSubCellGroups = 0;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        FCudaParams<unsigned char*,9> subCellGroupsPtr;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
-        FCudaParams<std::size_t,9> subCellGroupsSize;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
-        FCudaParams<unsigned char*,9> subCellGroupsDownPtr;
-        memset(&subCellGroupsDownPtr, 0, sizeof(subCellGroupsDownPtr));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsDownPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]));
-        }
-
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
         FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
@@ -224,8 +201,10 @@ public:
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                 STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                 (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
-                subCellGroupsPtr,subCellGroupsSize,subCellGroupsDownPtr,
-                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
+                idxLevel, kernel, starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
     /////////////////////////////////////////////////////////////////////////////////////
@@ -237,7 +216,7 @@ public:
 
         FStarPUPtrInterface* worker = nullptr;
         const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -260,7 +239,7 @@ public:
 
     static void directInPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
@@ -276,7 +255,7 @@ public:
     static void directInoutPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
diff --git a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
index 23325ea829ea174b4d1fa7946d24895c2e1b4259..5cd96a6d0aec89451aa46111752cf4a71656310e 100644
--- a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
+++ b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
@@ -18,6 +18,8 @@ public:
         // the pole represents all particles under
         if(threadIdx.x == 0){
             *pole.up += particles->getNbParticles();
+            printf("*pole.up %d += particles->getNbParticles(); %d \n",
+                   *pole.up , particles->getNbParticles());
         }
     }
 
diff --git a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
index 07015cfa8232431bddcfe78a771817f07437114e..522418cb77d05623f62cc10ed0d6808a13f863d4 100644
--- a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
+++ b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
@@ -80,7 +80,7 @@ int main(int argc, char* argv[]){
     typedef FGroupTestParticleContainer<FReal>                                     GroupContainerClass;
     typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,
             GroupContainerClass, 0, 1, long long int>  GroupOctreeClass;
-    typedef FStarPUAllCpuCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >>  GroupKernelClass;
+    typedef FStarPUAllCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >>  GroupKernelClass;
 
     typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
     typedef FStarPUCudaWrapper<GroupKernelClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,