diff --git a/CMakeLists.txt b/CMakeLists.txt
index e5a962af52497796a27cd1d74c88be77996f3133..e83c8d19852282c95ae82281abeca8079fd67937 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -455,7 +455,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
         message( FATAL_ERROR "nvcc is needed with CUDA." )
       endif()
       if(NOT DEFINED CUSTOM_CUDA_FLAGS)
-        set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20" CACHE
+        set( CUSTOM_CUDA_FLAGS "-std=c++11;-arch=sm_20;-ptxas-options=-v;-use_fast_math" CACHE
           STRING "Set your CUDA flags, for example : -arch=sm_20;-ptxas-options=-v;-use_fast_math")
       endif()
       # This is needed to remove backslash after space in ADD_CUSTOM_COMMAND
diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
index 8cdec810bdeddabad5aa10dbd72c4e712bd85113..53ff745ce694be40ec5ff9924b1c2dafe9ad77b3 100644
--- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
+++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
@@ -1093,3 +1093,181 @@ template void FCuda__ReleaseCudaKernel<FCudaP2P<double>>(FCudaP2P<double>* cuker
 
 template dim3 FCuda__GetGridSize< FCudaP2P<double> >(FCudaP2P<double>* kernel, int intervalSize);
 template dim3 FCuda__GetBlockSize< FCudaP2P<double> >(FCudaP2P<double>* cukernel);
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/////////////////////////////////////////////////////////////////////////////////////
+
+#include "../Uniform/FUnifCuda.hpp"
+
+template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
+    unsigned char* containersPtr, std::size_t containersSize,
+    FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+    int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#ifdef SCALFMM_USE_MPI
+template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
+    unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
+    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#endif
+template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
+    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+    int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
+    unsigned char* currentCellsDownPtr,
+    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+    unsigned char* externalCellsUpPtr,
+    int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+    int idxLevel, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#ifdef SCALFMM_USE_MPI
+template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    unsigned char* externalContainersPtr, std::size_t externalContainersSize,
+    const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell,
+    const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#endif
+template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
+    const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
+    const int safeOuterInteractions[], const int counterOuterCell,
+    const OutOfBlockInteraction* insideInteractions,
+    const int safeInnerInteractions[], const int counterInnerCell,
+    const int treeHeight, FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FUnifCuda<float> >
+    (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
+    unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    FUnifCuda<float>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template FUnifCuda<float>* FCuda__BuildCudaKernel<FUnifCuda<float>>(void* kernel);
+template void FCuda__ReleaseCudaKernel<FUnifCuda<float>>(FUnifCuda<float>* cukernel);
+
+template dim3 FCuda__GetGridSize< FUnifCuda<float> >(FUnifCuda<float>* kernel, int intervalSize);
+template dim3 FCuda__GetBlockSize< FUnifCuda<float> >(FUnifCuda<float>* cukernel);
+
+
+
+
+template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
+    unsigned char* containersPtr, std::size_t containersSize,
+    FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+    int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#ifdef SCALFMM_USE_MPI
+template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
+    unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
+    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#endif
+template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
+    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+    int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
+    unsigned char* currentCellsDownPtr,
+    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+    unsigned char* externalCellsUpPtr,
+    int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+    int idxLevel, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#ifdef SCALFMM_USE_MPI
+template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    unsigned char* externalContainersPtr, std::size_t externalContainersSize,
+    const OutOfBlockInteraction* outsideInteractions,
+    int nbOutsideInteractions, const int safeOuterInteractions[], const int counterOuterCell,
+    const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+#endif
+template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
+    const OutOfBlockInteraction* outsideInteractions, int nbOutsideInteractions,
+    const int safeOuterInteractions[], const int counterOuterCell,
+    const OutOfBlockInteraction* insideInteractions,
+    const int safeInnerInteractions[], const int counterInnerCell,
+    const int treeHeight, FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FUnifCuda<double> >
+    (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
+    unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
+    FUnifCuda<double>* kernel, cudaStream_t currentStream,
+                                        const dim3 inGridSize, const dim3 inBlocksSize);
+
+template FUnifCuda<double>* FCuda__BuildCudaKernel<FUnifCuda<double>>(void* kernel);
+template void FCuda__ReleaseCudaKernel<FUnifCuda<double>>(FUnifCuda<double>* cukernel);
+
+template dim3 FCuda__GetGridSize< FUnifCuda<double> >(FUnifCuda<double>* kernel, int intervalSize);
+template dim3 FCuda__GetBlockSize< FUnifCuda<double> >(FUnifCuda<double>* cukernel);
diff --git a/Src/GroupTree/Uniform/FUnifCuda.hpp b/Src/GroupTree/Uniform/FUnifCuda.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..ad97f2efd14aace14b99b77c7c284210b1328056
--- /dev/null
+++ b/Src/GroupTree/Uniform/FUnifCuda.hpp
@@ -0,0 +1,342 @@
+#ifndef FUNIFCUDA_HPP
+#define FUNIFCUDA_HPP
+
+#include "../Cuda/FCudaGlobal.hpp"
+#include "../Cuda/FCudaGroupAttachedLeaf.hpp"
+#include "../Cuda/FCudaEmptyCellSymb.hpp"
+#include "../Cuda/FCudaCompositeCell.hpp"
+
+#include "FUnifCellPOD.hpp"
+
+#define Min(x,y) ((x)<(y)?(x):(y))
+#define Max(x,y) ((x)>(y)?(x):(y))
+
+/**
+ * CUDA kernel class for the uniform interpolation (Unif) kernel.
+ * Only the direct interactions (P2P, P2PRemote, P2POuter) are computed on
+ * the GPU here; the far-field operators (P2M, M2M, M2L, L2L, L2P) are left empty.
+ */
+template <class FReal>
+class FUnifCuda {
+protected:
+public:
+
+    __device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ, const FReal& targetPhys,
+                           FReal& forceX, FReal& forceY, FReal& forceZ, FReal& potential,
+                           const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const {
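+        // Accumulates the classic 1/r pairwise interaction on the target:
+        //   potential += q_src / |r|
+        //   force     += q_tgt * q_src * r / |r|^3,   with r = x_src - x_tgt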
+        FReal dx = sourcesX - targetX;
+        FReal dy = sourcesY - targetY;
+        FReal dz = sourcesZ - targetZ;
+
+        FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
+        FReal inv_distance = sqrt(inv_square_distance);
+
+        inv_square_distance *= inv_distance;
+        inv_square_distance *= targetPhys * sourcesPhys;
+
+        dx *= inv_square_distance;
+        dy *= inv_square_distance;
+        dz *= inv_square_distance;
+
+        forceX += dx;
+        forceY += dy;
+        forceZ += dz;
+        potential += inv_distance * sourcesPhys;
+    }
+
+    static double DSqrt(const double val){
+        return sqrt(val);
+    }
+
+    static float FSqrt(const float val){
+        return sqrtf(val);
+    }
+
+    typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
+    typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass;
+
+    static const int NB_THREAD_GROUPS = 30; // 2 x 15
+    static const int THREAD_GROUP_SIZE = 256;
+    static const int SHARED_MEMORY_SIZE = 512; // tile length: 4 arrays of 512 doubles = 16 kB of the 49152 B of shared memory per block
+
+    __device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
+    }
+
+    __device__ void M2M(CellClass  /*pole*/, const CellClass  /*child*/[8], const int /*level*/) {
+    }
+
+    __device__ void M2L(CellClass  /*pole*/, const CellClass* /*distantNeighbors*/,
+                        const int* /*neighPositions*/,
+                        const int /*size*/, const int /*level*/) {
+    }
+
+    __device__ void L2L(const CellClass  /*local*/, CellClass  /*child*/[8], const int /*level*/) {
+    }
+
+    __device__ void L2P(const CellClass  /*local*/, ContainerClass*const /*particles*/){
+    }
+
+    __device__ void P2P(const int3& pos,
+                        ContainerClass* const  targets, const ContainerClass* const  sources,
+                        ContainerClass* const directNeighborsParticles,
+                        const int* neighborPositions, const int counter){
+        // Interactions with the neighboring leaves
+        P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter);
+        // Interactions within the leaf itself
+        const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
+        for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
+            const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
+            const bool threadCompute = (idxPart < targets->getNbParticles());
+
+            FReal targetX, targetY, targetZ, targetPhys;
+            FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+
+            targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+            targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+            targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+            targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
+
+            for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
+                __shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
+                __shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
+                __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
+                __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
+
+                const int nbCopies = Min(SHARED_MEMORY_SIZE, targets->getNbParticles()-idxCopy);
+                for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                    sourcesX[idx] = targets->getPositions()[0][idx+idxCopy];
+                    sourcesY[idx] = targets->getPositions()[1][idx+idxCopy];
+                    sourcesZ[idx] = targets->getPositions()[2][idx+idxCopy];
+                    sourcesPhys[idx] = targets->getAttribute(0)[idx+idxCopy];
+                }
+
+                __syncthreads();
+
+                if(threadCompute){
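+                    // Split the tile into [0, leftCopies) and [leftCopies+1, nbCopies)
+                    // when the target particle lies in this tile, so a particle never
+                    // interacts with itself; otherwise leftCopies == nbCopies and the
+                    // left part covers the whole tile.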
+                    int leftCopies = nbCopies;
+                    if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){
+                        leftCopies = idxPart - idxCopy;
+                    }
+
+                    // Left Part
+                    for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                    }
+
+                    for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // remainder when leftCopies is not a multiple of 4
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                    }
+                    // Right Part
+                    for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                    }
+
+                    for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // remainder of the right part
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                    }
+                }
+
+                __syncthreads();
+            }
+
+            if( threadCompute ){
+                targets->getAttribute(1)[idxPart] += potential;
+                targets->getAttribute(2)[idxPart] += forceX;
+                targets->getAttribute(3)[idxPart] += forceY;
+                targets->getAttribute(4)[idxPart] += forceZ;
+            }
+
+            __syncthreads();
+        }
+    }
+
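+    // Note: P2PRemote applies the same shared-memory tiling as the in-leaf part
+    // of P2P, but over the particles of each neighboring leaf, so no
+    // self-interaction split is needed.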
+    __device__ void P2PRemote(const int3& ,
+                              ContainerClass* const  targets, const ContainerClass* const  /*sources*/,
+                              ContainerClass* const directNeighborsParticles,
+                              const int* /*neighborsPositions*/, const int counter){
+        for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+
+            const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
+            for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
+                const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
+                const bool threadCompute = (idxPart < targets->getNbParticles());
+
+                FReal targetX, targetY, targetZ, targetPhys;
+                FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+
+                targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+                targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+                targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+                targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
+
+                for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
+                    __shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
+
+                    const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
+                    for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                        sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy];
+                        sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy];
+                        sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy];
+                        sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy];
+                    }
+
+                    __syncthreads();
+
+                    if(threadCompute){
+                        for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                        }
+
+                        for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // remainder when nbCopies is not a multiple of 4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        }
+                    }
+
+                    __syncthreads();
+                }
+
+                if( threadCompute ){
+                    targets->getAttribute(1)[idxPart] += potential;
+                    targets->getAttribute(2)[idxPart] += forceX;
+                    targets->getAttribute(3)[idxPart] += forceY;
+                    targets->getAttribute(4)[idxPart] += forceZ;
+                }
+
+
+                __syncthreads();
+            }
+        }
+    }
+
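+    // Note: P2POuter performs the same computation as P2PRemote, without the
+    // unused sources argument.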
+    __device__ void P2POuter(const int3& ,
+                             ContainerClass* const  targets,
+                             ContainerClass* const directNeighborsParticles,
+                             const int* /*neighborsPositions*/, const int counter){
+        for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+
+            const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
+            for(int idxLoop = 0 ; idxLoop < nbLoops ; ++idxLoop){
+                const int idxPart = (idxLoop*blockDim.x)+threadIdx.x;
+                const bool threadCompute = (idxPart < targets->getNbParticles());
+
+                FReal targetX, targetY, targetZ, targetPhys;
+                FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+
+                targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+                targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+                targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+                targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
+
+                for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARED_MEMORY_SIZE){
+                    __shared__ FReal sourcesX[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesY[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesZ[SHARED_MEMORY_SIZE];
+                    __shared__ FReal sourcesPhys[SHARED_MEMORY_SIZE];
+
+                    const int nbCopies = Min(SHARED_MEMORY_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
+                    for(int idx = threadIdx.x ; idx < nbCopies ; idx += blockDim.x){
+                        sourcesX[idx] = directNeighborsParticles[idxNeigh].getPositions()[0][idx+idxCopy];
+                        sourcesY[idx] = directNeighborsParticles[idxNeigh].getPositions()[1][idx+idxCopy];
+                        sourcesZ[idx] = directNeighborsParticles[idxNeigh].getPositions()[2][idx+idxCopy];
+                        sourcesPhys[idx] = directNeighborsParticles[idxNeigh].getAttribute(0)[idx+idxCopy];
+                    }
+
+                    __syncthreads();
+
+                    if(threadCompute){
+                        for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                        }
+
+                        for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // remainder when nbCopies is not a multiple of 4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        }
+                    }
+
+                    __syncthreads();
+                }
+
+                if( threadCompute ){
+                    targets->getAttribute(1)[idxPart] += potential;
+                    targets->getAttribute(2)[idxPart] += forceX;
+                    targets->getAttribute(3)[idxPart] += forceY;
+                    targets->getAttribute(4)[idxPart] += forceZ;
+                }
+
+                __syncthreads();
+            }
+        }
+    }
+
+    __host__ static FUnifCuda* InitKernelKernel(void*){
+        return nullptr; // stateless kernel: nothing to allocate on the device
+    }
+
+    __host__ static void ReleaseKernel(FUnifCuda* /*todealloc*/){
+        // nothing to do
+    }
+
+    __host__ static dim3 GetGridSize(const int /*intervalSize*/){
+        return NB_THREAD_GROUPS;
+    }
+
+    __host__ static dim3 GetBlocksSize(){
+        return THREAD_GROUP_SIZE;
+    }
+};
+
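+// Typical instantiation, as in Tests/GroupTree/testBlockedUnifCuda.cpp:
+//   FStarPUCudaWrapper<GroupKernelClass, FCudaEmptyCellSymb, int, int,
+//                      FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+//                      FCudaGroupOfParticles<FReal, 1, 4, FReal>,
+//                      FCudaGroupAttachedLeaf<FReal, 1, 4, FReal>,
+//                      FUnifCuda<FReal> > GroupCudaWrapper;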
+#endif // FUNIFCUDA_HPP
+
diff --git a/Src/Kernels/Uniform/FUnifTensor.hpp b/Src/Kernels/Uniform/FUnifTensor.hpp
index a3895283d30738c1b16a3b6150b3e4be9cc9c350..ab233ab57a03f2b684312c38744a70e1fa9e9ff5 100644
--- a/Src/Kernels/Uniform/FUnifTensor.hpp
+++ b/Src/Kernels/Uniform/FUnifTensor.hpp
@@ -18,7 +18,7 @@
 #ifndef FUNIFTENSOR_HPP
 #define FUNIFTENSOR_HPP
 
-#include "Utils/FMath.hpp"
+#include "../../Utils/FMath.hpp"
 
 #include "FUnifRoots.hpp"
 #include "../Interpolation/FInterpTensor.hpp"
diff --git a/Tests/GroupTree/testBlockedUnifCuda.cpp b/Tests/GroupTree/testBlockedUnifCuda.cpp
index df87193cfa8c3641f3e9796081ab1b00f8ca3c00..63380c7f15a0ae63fe084c9c485a1c4fbff3fac3 100644
--- a/Tests/GroupTree/testBlockedUnifCuda.cpp
+++ b/Tests/GroupTree/testBlockedUnifCuda.cpp
@@ -47,7 +47,7 @@
 #include <memory>
 
 template <class FReal>
-class FCudaP2P;
+class FUnifCuda;
 
 //#define RANDOM_PARTICLES
 
@@ -83,7 +83,7 @@ int main(int argc, char* argv[]){
     typedef FStarPUCudaWrapper<GroupKernelClass,
             FCudaEmptyCellSymb, int, int,
             FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-            FCudaGroupOfParticles<FReal, 4, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 4, 4, FReal>, FCudaP2P<FReal> > GroupCudaWrapper;
+            FCudaGroupOfParticles<FReal, 1, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 1, 4, FReal>, FUnifCuda<FReal> > GroupCudaWrapper;
 
     typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass,
             GroupCpuWrapper, GroupCudaWrapper > GroupAlgorithm;