diff --git a/Addons/CKernelApi/CMakeLists.txt b/Addons/CKernelApi/CMakeLists.txt
index 07083e44d024f53e042543b6413adad7bb5a9fa4..9cbc5a189271ebbfdf33b35dcced90828235fd8f 100644
--- a/Addons/CKernelApi/CMakeLists.txt
+++ b/Addons/CKernelApi/CMakeLists.txt
@@ -29,8 +29,8 @@ if(SCALFMM_ADDON_CKERNELAPI)
 
         # Adding the entire project dir as an include dir
         INCLUDE_DIRECTORIES(
-         ${CMAKE_BINARY_DIR}/Src
-         ${CMAKE_SOURCE_DIR}/Src
+         ${SCALFMM_BINARY_DIR}/Src
+         ${SCALFMM_SOURCE_DIR}/Src
          ${SCALFMM_INCLUDES}
     )
 
@@ -44,7 +44,7 @@ if(SCALFMM_ADDON_CKERNELAPI)
         INSTALL( FILES ${hpp_in_dir} DESTINATION include/ScalFmm/CKernelApi )
 
         file( GLOB_RECURSE source_tests_files Tests/*.c )
-        INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src )
+        INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src )
 
         # Then build test files
         foreach(exec ${source_tests_files})
diff --git a/Addons/FmmApi/CMakeLists.txt b/Addons/FmmApi/CMakeLists.txt
index 2ad1402ef04aa9807297d1839fa43b742453e782..c462e2484bda9b0aad55d7bcb1d97a3d8531a78d 100644
--- a/Addons/FmmApi/CMakeLists.txt
+++ b/Addons/FmmApi/CMakeLists.txt
@@ -31,8 +31,8 @@ if(SCALFMM_ADDON_FMMAPI)
 
 	# Adding the entire project dir as an include dir
 	INCLUDE_DIRECTORIES(
-         ${CMAKE_BINARY_DIR}/Src 
-         ${CMAKE_SOURCE_DIR}/Src   
+         ${SCALFMM_BINARY_DIR}/Src 
+         ${SCALFMM_SOURCE_DIR}/Src   
          ${SCALFMM_INCLUDES}
     )
 
@@ -46,7 +46,7 @@ if(SCALFMM_ADDON_FMMAPI)
 	INSTALL( FILES ${hpp_in_dir} DESTINATION include/ScalFmm/FmmApi )
 
 	file( GLOB_RECURSE source_tests_files Tests/*.cpp )
-	INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src )
+	INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src )
 
 	# Then build test files
 	foreach(exec ${source_tests_files}) 
diff --git a/Addons/HMat/CMakeLists.txt b/Addons/HMat/CMakeLists.txt
index 3132500f7f3569d30acced136f94e6d4fa46bc55..ad81efbc94091dfd08cdc34f2918dd70f3c9afac 100644
--- a/Addons/HMat/CMakeLists.txt
+++ b/Addons/HMat/CMakeLists.txt
@@ -43,8 +43,8 @@ if(SCALFMM_ADDON_HMAT)
 
         # Adding the entire project dir as an include dir
         INCLUDE_DIRECTORIES(
-             ${CMAKE_BINARY_DIR}/Src
-             ${CMAKE_SOURCE_DIR}/Src
+             ${SCALFMM_BINARY_DIR}/Src
+             ${SCALFMM_SOURCE_DIR}/Src
              ${SCALFMM_INCLUDES}
         )
 
@@ -65,7 +65,7 @@ if(SCALFMM_ADDON_HMAT)
         install( TARGETS cclusteringlib ARCHIVE DESTINATION lib )
 
         file( GLOB_RECURSE source_tests_files Tests/*.cpp )
-        INCLUDE_DIRECTORIES( ${CMAKE_BINARY_DIR}/Src )
+        INCLUDE_DIRECTORIES( ${SCALFMM_BINARY_DIR}/Src )
 
         # Then build test files
         SET(hmat_list_execs "")
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 442cd53b07df6cef293b921c560828d618a736c2..e5a962af52497796a27cd1d74c88be77996f3133 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -93,6 +93,7 @@ if (MORSE_DISTRIB_DIR OR EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/CMakeModules/morse/
 # OPENMP 4/5 support 
     option( OPENMP_SUPPORT_COMMUTE   "Set to ON to let tasks commute (KSTAR/StarPU compiler only)" OFF )
     option( OPENMP_SUPPORT_PRIORITY  "Set to ON to enable tasks priority (KSTAR/StarPU compiler only)" OFF )
+    option( OPENMP_SUPPORT_TASK_NAME "Set to ON to enable a taskname clause for tasks (KSTAR/StarPU compiler only)" OFF )
     option( SCALFMM_DISABLE_NATIVE_OMP4 "Set to ON to disable the gcc/intel omp4"    OFF )
     option( SCALFMM_TIME_OMPTASKS "Set to ON to time omp4 tasks and generate output file"    OFF )
 # STARPU options
diff --git a/CMakeModules/morse/find/FindFFTW.cmake b/CMakeModules/morse/find/FindFFTW.cmake
index 8a992479577a07cbced3ee471947196915484c82..ac98df8b664267d7f9bb224f215cc28a1b6bb6fb 100644
--- a/CMakeModules/morse/find/FindFFTW.cmake
+++ b/CMakeModules/morse/find/FindFFTW.cmake
@@ -64,10 +64,10 @@
 
 
 if (NOT FFTW_FOUND)
-    set(FFTW_DIR "" CACHE PATH "Installation directory of FFTW library given by user")
-    if (NOT FFTW_FIND_QUIETLY)
-        message(STATUS "A cache variable, namely FFTW_DIR, has been set to specify the install directory of FFTW")
-    endif()
+  set(FFTW_DIR "" CACHE PATH "Installation directory of FFTW library given by user")
+  if (NOT FFTW_FIND_QUIETLY)
+    message(STATUS "A cache variable, namely FFTW_DIR, has been set to specify the install directory of FFTW")
+  endif()
 endif()
 
 # Set the version to find
@@ -80,223 +80,264 @@ set(FFTW_LOOK_FOR_FFTW_LONG OFF)
 set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
 
 if( FFTW_FIND_COMPONENTS )
-    foreach( component ${FFTW_FIND_COMPONENTS} )
-        if (${component} STREQUAL "THREADS")
-            # means we look for the Threads version of FFTW
-            set(FFTW_LOOK_FOR_THREADS ON)
-        endif()
-        if (${component} STREQUAL "OMP")
-            # means we look for the OpenMP version of FFTW
-            set(FFTW_LOOK_FOR_OMP ON)
-        endif()
-        if (${component} STREQUAL "SIMPLE")
-            # means we look for FFTW simple precision (fftw3f)
-            set(FFTW_LOOK_FOR_FFTW_SIMPLE ON)
-            set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_LONG OFF)
-            set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
-        endif()
-        if (${component} STREQUAL "DOUBLE")
-            # means we look for FFTW double precision (fftw3)
-            set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_DOUBLE ON)
-            set(FFTW_LOOK_FOR_FFTW_LONG OFF)
-            set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
-        endif()
-        if (${component} STREQUAL "LONG")
-            # means we look for FFTW long double precision (fftw3l)
-            set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_LONG ON)
-            set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
-        endif()
-        if (${component} STREQUAL "QUAD")
-            # means we look for FFTW quad precision (fftw3q)
-            set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
-            set(FFTW_LOOK_FOR_FFTW_LONG OFF)
-            set(FFTW_LOOK_FOR_FFTW_QUAD ON)
-        endif()
-        if (${component} STREQUAL "MKL")
-            # means we look for the Intel MKL version of FFTW
-            set(FFTW_LOOK_FOR_MKL ON)
-            if (FFTW_LOOK_FOR_FFTW_LONG)
-                message(WARNING "Looking for FFTW -- long precision functions do not exist in MKL FFTW")
-                set(FFTW_LOOK_FOR_FFTW_LONG OFF)
-            endif()
-            if (FFTW_LOOK_FOR_FFTW_QUAD)
-                message(WARNING "Looking for FFTW -- quadruple functions do not exist in MKL FFTW")
-                set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
-            endif()
-        endif()
-    endforeach()
+  foreach( component ${FFTW_FIND_COMPONENTS} )
+    if (${component} STREQUAL "THREADS")
+      # means we look for the Threads version of FFTW
+      set(FFTW_LOOK_FOR_THREADS ON)
+    endif()
+    if (${component} STREQUAL "OMP")
+      # means we look for the OpenMP version of FFTW
+      set(FFTW_LOOK_FOR_OMP ON)
+    endif()
+    if (${component} STREQUAL "SIMPLE")
+      # means we look for FFTW simple precision (fftw3f)
+      set(FFTW_LOOK_FOR_FFTW_SIMPLE ON)
+      set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_LONG OFF)
+      set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
+    endif()
+    if (${component} STREQUAL "DOUBLE")
+      # means we look for FFTW double precision (fftw3)
+      set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_DOUBLE ON)
+      set(FFTW_LOOK_FOR_FFTW_LONG OFF)
+      set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
+    endif()
+    if (${component} STREQUAL "LONG")
+      # means we look for FFTW long double precision (fftw3l)
+      set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_LONG ON)
+      set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
+    endif()
+    if (${component} STREQUAL "QUAD")
+      # means we look for FFTW quad precision (fftw3q)
+      set(FFTW_LOOK_FOR_FFTW_SIMPLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_DOUBLE OFF)
+      set(FFTW_LOOK_FOR_FFTW_LONG OFF)
+      set(FFTW_LOOK_FOR_FFTW_QUAD ON)
+    endif()
+    if (${component} STREQUAL "MKL")
+      # means we look for the Intel MKL version of FFTW
+      set(FFTW_LOOK_FOR_MKL ON)
+      if (FFTW_LOOK_FOR_FFTW_LONG)
+	message(WARNING "Looking for FFTW -- long precision functions do not exist in MKL FFTW")
+	set(FFTW_LOOK_FOR_FFTW_LONG OFF)
+      endif()
+      if (FFTW_LOOK_FOR_FFTW_QUAD)
+	message(WARNING "Looking for FFTW -- quadruple functions do not exist in MKL FFTW")
+	set(FFTW_LOOK_FOR_FFTW_QUAD OFF)
+      endif()
+    endif()
+  endforeach()
 endif()
 
 if (FFTW_LOOK_FOR_THREADS)
-    if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_THREADS)
-        find_package(Threads REQUIRED)
-    else()
-        find_package(Threads)
-    endif()
+  if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_THREADS)
+    find_package(Threads REQUIRED)
+  else()
+    find_package(Threads)
+  endif()
 endif()
 
 if (FFTW_LOOK_FOR_MKL)
-    if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_MKL)
-        find_package(Threads REQUIRED)
-    else()
-        find_package(Threads)
-    endif()
+  if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_MKL)
+    find_package(Threads REQUIRED)
+  else()
+    find_package(Threads)
+  endif()
 endif()
 
 if (FFTW_LOOK_FOR_OMP)
-    if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_OMP)
-        find_package(OpenMP REQUIRED)
-    else()
-        find_package(OpenMP)
-    endif()
+  if (FFTW_FIND_REQUIRED AND FFTW_FIND_REQUIRED_OMP)
+    find_package(OpenMP REQUIRED)
+  else()
+    find_package(OpenMP)
+  endif()
 endif()
 
-# Looking for include
-# -------------------
 
-# Add system include paths to search include
-# ------------------------------------------
-unset(_inc_env)
-set(ENV_MKLROOT "$ENV{MKLROOT}")
 set(ENV_FFTW_DIR "$ENV{FFTW_DIR}")
 set(ENV_FFTW_INCDIR "$ENV{FFTW_INCDIR}")
-if(ENV_FFTW_INCDIR)
+set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}")
+set(FFTW_GIVEN_BY_USER "FALSE")
+if ( FFTW_DIR OR ( FFTW_INCDIR AND FFTW_LIBDIR) OR ENV_FFTW_DIR OR (ENV_FFTW_INCDIR AND ENV_FFTW_LIBDIR) )
+  set(FFTW_GIVEN_BY_USER "TRUE")
+endif()
+
+# Optionally use pkg-config to detect include/library dirs (if pkg-config is available)
+# -------------------------------------------------------------------------------------
+# find_package(PkgConfig) already loads the FindPkgConfig module, so no separate include() is needed.
+find_package(PkgConfig QUIET)
+if( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER )
+
+  # Ask pkg-config for the fftw3 module; on success this fills the FFTW_* result variables.
+  pkg_search_module(FFTW fftw3)
+  if (NOT FFTW_FIND_QUIETLY)
+    if (FFTW_FOUND AND FFTW_LIBRARIES)
+      message(STATUS "Looking for FFTW - found using PkgConfig")
+    else()
+      # message() concatenates its arguments without separators, hence the trailing spaces.
+      message("${Magenta}Looking for FFTW - not found using PkgConfig. "
+        "Perhaps you should add the directory containing fftw3.pc to "
+        "the PKG_CONFIG_PATH environment variable.${ColourReset}")
+    endif()
+  endif()
+
+  # Only publish the results and mark FFTW as working when pkg-config really found it;
+  # otherwise leave FFTW_WORKS untouched so the manual search below can decide.
+  if (FFTW_FOUND)
+    set(FFTW_INCLUDE_DIRS_DEP "${FFTW_INCLUDE_DIRS}")
+    set(FFTW_LIBRARY_DIRS_DEP "${FFTW_LIBRARY_DIRS}")
+    set(FFTW_LIBRARIES_DEP "${FFTW_LIBRARIES}")
+    set(FFTW_WORKS TRUE)
+  endif()
+endif( PKG_CONFIG_EXECUTABLE AND NOT FFTW_GIVEN_BY_USER )
+
+
+if( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) )
+
+  # Looking for include
+  # -------------------
+
+  # Add system include paths to search include
+  # ------------------------------------------
+  unset(_inc_env)
+  set(ENV_MKLROOT "$ENV{MKLROOT}")
+  set(ENV_FFTW_DIR "$ENV{FFTW_DIR}")
+  set(ENV_FFTW_INCDIR "$ENV{FFTW_INCDIR}")
+  if(ENV_FFTW_INCDIR)
     list(APPEND _inc_env "${ENV_FFTW_INCDIR}")
-elseif(ENV_FFTW_DIR)
+  elseif(ENV_FFTW_DIR)
     list(APPEND _inc_env "${ENV_FFTW_DIR}")
     list(APPEND _inc_env "${ENV_FFTW_DIR}/include")
     list(APPEND _inc_env "${ENV_FFTW_DIR}/include/fftw")
-else()
+  else()
     if (ENV_MKLROOT)
-        list(APPEND _inc_env "${ENV_MKLROOT}/include/fftw")
+      list(APPEND _inc_env "${ENV_MKLROOT}/include/fftw")
     endif()
     # system variables
     if(WIN32)
-        string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
-        list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+      list(APPEND _inc_env "${_path_env}")
     else()
-        string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
-        list(APPEND _inc_env "${_path_env}")
-        string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
-        list(APPEND _inc_env "${_path_env}")
-        string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
-        list(APPEND _inc_env "${_path_env}")
-        string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
-        list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{INCLUDE}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{C_INCLUDE_PATH}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{CPATH}")
+      list(APPEND _inc_env "${_path_env}")
+      string(REPLACE ":" ";" _path_env "$ENV{INCLUDE_PATH}")
+      list(APPEND _inc_env "${_path_env}")
     endif()
-endif()
-list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
-list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
-list(REMOVE_DUPLICATES _inc_env)
-
-# set paths where to look for
-set(PATH_TO_LOOK_FOR "${_inc_env}")
-
-# Try to find the fftw header in the given paths
-# -------------------------------------------------
-# call cmake macro to find the header path
-if(FFTW_INCDIR)
+  endif()
+  list(APPEND _inc_env "${CMAKE_PLATFORM_IMPLICIT_INCLUDE_DIRECTORIES}")
+  list(APPEND _inc_env "${CMAKE_C_IMPLICIT_INCLUDE_DIRECTORIES}")
+  list(REMOVE_DUPLICATES _inc_env)
+
+  # set paths where to look for
+  set(PATH_TO_LOOK_FOR "${_inc_env}")
+
+  # Try to find the fftw header in the given paths
+  # -------------------------------------------------
+  # call cmake macro to find the header path
+  if(FFTW_INCDIR)
     set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND")
     find_path(FFTW_fftw3.h_DIRS
       NAMES fftw3.h
       HINTS ${FFTW_INCDIR})
-else()
+  else()
     if(FFTW_DIR)
-        set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND")
-        find_path(FFTW_fftw3.h_DIRS
-          NAMES fftw3.h
-          HINTS ${FFTW_DIR}
-          PATH_SUFFIXES "include" "include/fftw")
+      set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND")
+      find_path(FFTW_fftw3.h_DIRS
+        NAMES fftw3.h
+        HINTS ${FFTW_DIR}
+        PATH_SUFFIXES "include" "include/fftw")
     else()
-        set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND")
-        find_path(FFTW_fftw3.h_DIRS
-                  NAMES fftw3.h
-                  HINTS ${PATH_TO_LOOK_FOR}
-                  PATH_SUFFIXES "fftw")
+      set(FFTW_fftw3.h_DIRS "FFTW_fftw3.h_DIRS-NOTFOUND")
+      find_path(FFTW_fftw3.h_DIRS
+        NAMES fftw3.h
+        HINTS ${PATH_TO_LOOK_FOR}
+        PATH_SUFFIXES "fftw")
     endif()
-endif()
-mark_as_advanced(FFTW_fftw3.h_DIRS)
+  endif()
+  mark_as_advanced(FFTW_fftw3.h_DIRS)
 
-# Add path to cmake variable
-# ------------------------------------
-if (FFTW_fftw3.h_DIRS)
+  # Add path to cmake variable
+  # ------------------------------------
+  if (FFTW_fftw3.h_DIRS)
     set(FFTW_INCLUDE_DIRS "${FFTW_fftw3.h_DIRS}")
-else ()
+  else ()
     set(FFTW_INCLUDE_DIRS "FFTW_INCLUDE_DIRS-NOTFOUND")
     if(NOT FFTW_FIND_QUIETLY)
-        message(STATUS "Looking for FFTW -- fftw3.h not found")
+      message(STATUS "Looking for FFTW -- fftw3.h not found")
     endif()
-endif ()
+  endif ()
 
 
-# Looking for lib
-# ---------------
+  # Looking for lib
+  # ---------------
 
-# Add system library paths to search lib
-# --------------------------------------
-unset(_lib_env)
-set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}")
-if(ENV_FFTW_LIBDIR)
+  # Add system library paths to search lib
+  # --------------------------------------
+  unset(_lib_env)
+  set(ENV_FFTW_LIBDIR "$ENV{FFTW_LIBDIR}")
+  if(ENV_FFTW_LIBDIR)
     list(APPEND _lib_env "${ENV_FFTW_LIBDIR}")
-elseif(ENV_FFTW_DIR)
+  elseif(ENV_FFTW_DIR)
     list(APPEND _lib_env "${ENV_FFTW_DIR}")
     list(APPEND _lib_env "${ENV_FFTW_DIR}/lib")
     if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-        list(APPEND _lib_env "${ENV_FFTW_DIR}/lib64")
-        list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/intel64")
+      list(APPEND _lib_env "${ENV_FFTW_DIR}/lib64")
+      list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/intel64")
     else()
-        list(APPEND _lib_env "${ENV_FFTW_DIR}/lib32")
-        list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/ia32")
+      list(APPEND _lib_env "${ENV_FFTW_DIR}/lib32")
+      list(APPEND _lib_env "${ENV_FFTW_DIR}/lib/ia32")
     endif()
-else()
+  else()
     if (ENV_MKLROOT)
-        list(APPEND _lib_env "${ENV_MKLROOT}/lib")
-        if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
-            list(APPEND _lib_env "${ENV_MKLROOT}/lib64")
-            list(APPEND _lib_env "${ENV_MKLROOT}/lib/intel64")
-        else()
-            list(APPEND _lib_env "${ENV_MKLROOT}/lib32")
-            list(APPEND _lib_env "${ENV_MKLROOT}/lib/ia32")
-        endif()
+      list(APPEND _lib_env "${ENV_MKLROOT}/lib")
+      if("${CMAKE_SIZEOF_VOID_P}" EQUAL "8")
+	list(APPEND _lib_env "${ENV_MKLROOT}/lib64")
+	list(APPEND _lib_env "${ENV_MKLROOT}/lib/intel64")
+      else()
+	list(APPEND _lib_env "${ENV_MKLROOT}/lib32")
+	list(APPEND _lib_env "${ENV_MKLROOT}/lib/ia32")
+      endif()
     endif()
     if(WIN32)
-        string(REPLACE ":" ";" _lib_env2 "$ENV{LIB}")
+      string(REPLACE ":" ";" _lib_env2 "$ENV{LIB}")
     else()
-        if(APPLE)
-            string(REPLACE ":" ";" _lib_env2 "$ENV{DYLD_LIBRARY_PATH}")
-        else()
-            string(REPLACE ":" ";" _lib_env2 "$ENV{LD_LIBRARY_PATH}")
-        endif()
-        list(APPEND _lib_env "${_lib_env2}")
-        list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
-        list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
+      if(APPLE)
+	string(REPLACE ":" ";" _lib_env2 "$ENV{DYLD_LIBRARY_PATH}")
+      else()
+	string(REPLACE ":" ";" _lib_env2 "$ENV{LD_LIBRARY_PATH}")
+      endif()
+      list(APPEND _lib_env "${_lib_env2}")
+      list(APPEND _lib_env "${CMAKE_PLATFORM_IMPLICIT_LINK_DIRECTORIES}")
+      list(APPEND _lib_env "${CMAKE_C_IMPLICIT_LINK_DIRECTORIES}")
     endif()
-endif()
-list(REMOVE_DUPLICATES _lib_env)
+  endif()
+  list(REMOVE_DUPLICATES _lib_env)
 
-# set paths where to look for
-set(PATH_TO_LOOK_FOR "${_lib_env}")
+  # set paths where to look for
+  set(PATH_TO_LOOK_FOR "${_lib_env}")
 
-if(FFTW_LOOK_FOR_FFTW_SIMPLE)
+  if(FFTW_LOOK_FOR_FFTW_SIMPLE)
     set(FFTW_PREC "f")
     set(FFTW_PREC_TESTFUNC "s")
-elseif(FFTW_LOOK_FOR_FFTW_DOUBLE)
+  elseif(FFTW_LOOK_FOR_FFTW_DOUBLE)
     set(FFTW_PREC "")
     set(FFTW_PREC_TESTFUNC "d")
-elseif(FFTW_LOOK_FOR_FFTW_LONG)
+  elseif(FFTW_LOOK_FOR_FFTW_LONG)
     set(FFTW_PREC "l")
     set(FFTW_PREC_TESTFUNC "l")
-elseif(FFTW_LOOK_FOR_FFTW_QUAD)
+  elseif(FFTW_LOOK_FOR_FFTW_QUAD)
     set(FFTW_PREC "q")
     set(FFTW_PREC_TESTFUNC "q")
-endif()
+  endif()
 
-if (FFTW_LOOK_FOR_MKL)
+  if (FFTW_LOOK_FOR_MKL)
 
     set(FFTW_libs_to_find "mkl_intel_lp64;mkl_sequential;mkl_core")
 
@@ -305,39 +346,39 @@ if (FFTW_LOOK_FOR_MKL)
 
     # call cmake macro to find the lib path
     if(FFTW_LIBDIR)
-        foreach(fftw_lib ${FFTW_libs_to_find})
-            set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-            find_library(FFTW_${fftw_lib}_LIBRARY
-                NAMES ${fftw_lib}
-                HINTS ${FFTW_LIBDIR})
-        endforeach()
+      foreach(fftw_lib ${FFTW_libs_to_find})
+	set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	find_library(FFTW_${fftw_lib}_LIBRARY
+	  NAMES ${fftw_lib}
+	  HINTS ${FFTW_LIBDIR})
+      endforeach()
     else()
-        if(FFTW_DIR)
-            foreach(fftw_lib ${FFTW_libs_to_find})
-                set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-                find_library(FFTW_${fftw_lib}_LIBRARY
-                    NAMES ${fftw_lib}
-                    HINTS ${FFTW_DIR}
-                    PATH_SUFFIXES lib lib32 lib64)
-            endforeach()
-        else()
-            foreach(fftw_lib ${FFTW_libs_to_find})
-                set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-                find_library(FFTW_${fftw_lib}_LIBRARY
-                         NAMES ${fftw_lib}
-                         HINTS ${PATH_TO_LOOK_FOR})
-            endforeach()
-        endif()
+      if(FFTW_DIR)
+	foreach(fftw_lib ${FFTW_libs_to_find})
+	  set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	  find_library(FFTW_${fftw_lib}_LIBRARY
+	    NAMES ${fftw_lib}
+	    HINTS ${FFTW_DIR}
+	    PATH_SUFFIXES lib lib32 lib64)
+	endforeach()
+      else()
+	foreach(fftw_lib ${FFTW_libs_to_find})
+	  set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	  find_library(FFTW_${fftw_lib}_LIBRARY
+	    NAMES ${fftw_lib}
+	    HINTS ${PATH_TO_LOOK_FOR})
+	endforeach()
+      endif()
     endif()
 
-else(FFTW_LOOK_FOR_MKL)
+  else(FFTW_LOOK_FOR_MKL)
 
     if (FFTW_LOOK_FOR_THREADS)
-        set(FFTW_libs_to_find "fftw3${FFTW_PREC}_threads;fftw3${FFTW_PREC};fftw3")
+      set(FFTW_libs_to_find "fftw3${FFTW_PREC}_threads;fftw3${FFTW_PREC};fftw3")
     elseif (FFTW_LOOK_FOR_OMP)
-        set(FFTW_libs_to_find "fftw3${FFTW_PREC}_omp;fftw3${FFTW_PREC};fftw3")
+      set(FFTW_libs_to_find "fftw3${FFTW_PREC}_omp;fftw3${FFTW_PREC};fftw3")
     else()
-        set(FFTW_libs_to_find "fftw3${FFTW_PREC};fftw3")
+      set(FFTW_libs_to_find "fftw3${FFTW_PREC};fftw3")
     endif()
 
     # Try to find the fftw lib in the given paths
@@ -345,59 +386,59 @@ else(FFTW_LOOK_FOR_MKL)
 
     # call cmake macro to find the lib path
     if(FFTW_LIBDIR)
-        foreach(fftw_lib ${FFTW_libs_to_find})
-            set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-            find_library(FFTW_${fftw_lib}_LIBRARY
-                NAMES ${fftw_lib}
-                HINTS ${FFTW_LIBDIR})
-        endforeach()
+      foreach(fftw_lib ${FFTW_libs_to_find})
+	set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	find_library(FFTW_${fftw_lib}_LIBRARY
+	  NAMES ${fftw_lib}
+	  HINTS ${FFTW_LIBDIR})
+      endforeach()
     else()
-        if(FFTW_DIR)
-            foreach(fftw_lib ${FFTW_libs_to_find})
-                set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-                find_library(FFTW_${fftw_lib}_LIBRARY
-                    NAMES ${fftw_lib}
-                    HINTS ${FFTW_DIR}
-                    PATH_SUFFIXES lib lib32 lib64)
-            endforeach()
-        else()
-            foreach(fftw_lib ${FFTW_libs_to_find})
-                set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
-                find_library(FFTW_${fftw_lib}_LIBRARY
-                         NAMES ${fftw_lib}
-                         HINTS ${PATH_TO_LOOK_FOR})
-            endforeach()
-        endif()
+      if(FFTW_DIR)
+	foreach(fftw_lib ${FFTW_libs_to_find})
+	  set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	  find_library(FFTW_${fftw_lib}_LIBRARY
+	    NAMES ${fftw_lib}
+	    HINTS ${FFTW_DIR}
+	    PATH_SUFFIXES lib lib32 lib64)
+	endforeach()
+      else()
+	foreach(fftw_lib ${FFTW_libs_to_find})
+	  set(FFTW_${fftw_lib}_LIBRARY "FFTW_${fftw_lib}_LIBRARY-NOTFOUND")
+	  find_library(FFTW_${fftw_lib}_LIBRARY
+	    NAMES ${fftw_lib}
+	    HINTS ${PATH_TO_LOOK_FOR})
+	endforeach()
+      endif()
     endif()
 
-endif(FFTW_LOOK_FOR_MKL)
+  endif(FFTW_LOOK_FOR_MKL)
 
-# If found, add path to cmake variable
-# ------------------------------------
-set(FFTW_LIBRARIES "")
-set(FFTW_LIBRARY_DIRS "")
-foreach(fftw_lib ${FFTW_libs_to_find})
+  # If found, add path to cmake variable
+  # ------------------------------------
+  set(FFTW_LIBRARIES "")
+  set(FFTW_LIBRARY_DIRS "")
+  foreach(fftw_lib ${FFTW_libs_to_find})
 
     if (FFTW_${fftw_lib}_LIBRARY)
-        get_filename_component(${fftw_lib}_lib_path "${FFTW_${fftw_lib}_LIBRARY}" PATH)
-        # set cmake variables
-        list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}")
-        list(APPEND FFTW_LIBRARY_DIRS "${${fftw_lib}_lib_path}")
+      get_filename_component(${fftw_lib}_lib_path "${FFTW_${fftw_lib}_LIBRARY}" PATH)
+      # set cmake variables
+      list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}")
+      list(APPEND FFTW_LIBRARY_DIRS "${${fftw_lib}_lib_path}")
     else ()
-        list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}")
-        if (NOT FFTW_FIND_QUIETLY)
-            message(STATUS "Looking for FFTW -- lib ${fftw_lib} not found")
-        endif()
+      list(APPEND FFTW_LIBRARIES "${FFTW_${fftw_lib}_LIBRARY}")
+      if (NOT FFTW_FIND_QUIETLY)
+	message(STATUS "Looking for FFTW -- lib ${fftw_lib} not found")
+      endif()
     endif ()
     mark_as_advanced(FFTW_${fftw_lib}_LIBRARY)
 
-endforeach()
+  endforeach()
 
-list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS)
-list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS)
+  list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS)
+  list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS)
 
-# check a function to validate the find
-if(FFTW_LIBRARIES)
+  # check a function to validate the find
+  if(FFTW_LIBRARIES)
 
     set(REQUIRED_FLAGS)
     set(REQUIRED_LDFLAGS)
@@ -407,39 +448,39 @@ if(FFTW_LIBRARIES)
 
     # FFTW
     if (FFTW_INCLUDE_DIRS)
-        set(REQUIRED_INCDIRS "${FFTW_INCLUDE_DIRS}")
+      set(REQUIRED_INCDIRS "${FFTW_INCLUDE_DIRS}")
     endif()
     if (FFTW_LIBRARY_DIRS)
-        set(REQUIRED_LIBDIRS "${FFTW_LIBRARY_DIRS}")
+      set(REQUIRED_LIBDIRS "${FFTW_LIBRARY_DIRS}")
     endif()
     set(REQUIRED_LIBS "${FFTW_LIBRARIES}")
     # THREADS
     if (FFTW_LOOK_FOR_THREADS)
-        list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
+      list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
     endif()
     # OMP
     if(FFTW_LOOK_FOR_OMP)
-        if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
-            # either gomp ...
-            #set(REQUIRED_FLAGS "-fopenmp")
-            #list(APPEND REQUIRED_LIBS "-lgomp")
-            # or iomp5
-            list(APPEND REQUIRED_LIBS "-liomp5")
-        elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel")
-            list(APPEND REQUIRED_LIBS "-liomp5")
-        endif()
+      if (CMAKE_C_COMPILER_ID STREQUAL "GNU")
+	# either gomp ...
+	#set(REQUIRED_FLAGS "-fopenmp")
+	#list(APPEND REQUIRED_LIBS "-lgomp")
+	# or iomp5
+	list(APPEND REQUIRED_LIBS "-liomp5")
+      elseif (CMAKE_C_COMPILER_ID STREQUAL "Intel")
+	list(APPEND REQUIRED_LIBS "-liomp5")
+      endif()
     endif()
     # MKL
     if(FFTW_LOOK_FOR_MKL)
-        list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
-        if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
-            list(APPEND REQUIRED_LDFLAGS "-Wl,--no-as-needed")
-        endif()
+      list(APPEND REQUIRED_LIBS "${CMAKE_THREAD_LIBS_INIT}")
+      if (CMAKE_C_COMPILER_ID STREQUAL "GNU" AND CMAKE_SYSTEM_NAME STREQUAL "Linux")
+	list(APPEND REQUIRED_LDFLAGS "-Wl,--no-as-needed")
+      endif()
     endif()
     # m
     find_library(M_LIBRARY NAMES m)
     if(M_LIBRARY)
-        list(APPEND REQUIRED_LIBS "-lm")
+      list(APPEND REQUIRED_LIBS "-lm")
     endif()
 
     # set required libraries for link
@@ -447,7 +488,7 @@ if(FFTW_LIBRARIES)
     set(CMAKE_REQUIRED_LIBRARIES)
     list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LDFLAGS}")
     foreach(lib_dir ${REQUIRED_LIBDIRS})
-        list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
+      list(APPEND CMAKE_REQUIRED_LIBRARIES "-L${lib_dir}")
     endforeach()
     list(APPEND CMAKE_REQUIRED_LIBRARIES "${REQUIRED_LIBS}")
     list(APPEND CMAKE_REQUIRED_FLAGS "${REQUIRED_FLAGS}")
@@ -460,46 +501,48 @@ if(FFTW_LIBRARIES)
     mark_as_advanced(FFTW_WORKS)
 
     if(FFTW_WORKS)
-        # save link with dependencies
-        set(FFTW_LIBRARIES_DEP "${REQUIRED_LIBS}")
-        set(FFTW_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}")
-        set(FFTW_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}")
-        set(FFTW_C_FLAGS "${REQUIRED_FLAGS}")
-        set(FFTW_LINKER_FLAGS "${REQUIRED_LDFLAGS}")
-        list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS_DEP)
-        list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS_DEP)
-        list(REMOVE_DUPLICATES FFTW_LINKER_FLAGS)
+      # save link with dependencies
+      set(FFTW_LIBRARIES_DEP "${REQUIRED_LIBS}")
+      set(FFTW_LIBRARY_DIRS_DEP "${REQUIRED_LIBDIRS}")
+      set(FFTW_INCLUDE_DIRS_DEP "${REQUIRED_INCDIRS}")
+      set(FFTW_C_FLAGS "${REQUIRED_FLAGS}")
+      set(FFTW_LINKER_FLAGS "${REQUIRED_LDFLAGS}")
+      list(REMOVE_DUPLICATES FFTW_LIBRARY_DIRS_DEP)
+      list(REMOVE_DUPLICATES FFTW_INCLUDE_DIRS_DEP)
+      list(REMOVE_DUPLICATES FFTW_LINKER_FLAGS)
     else()
-        if(NOT FFTW_FIND_QUIETLY)
-            message(STATUS "Looking for FFTW : test of ${FFTW_PREC_TESTFUNC}fftw_execute_ with fftw library fails")
-            message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
-            message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
-            message(STATUS "CMAKE_REQUIRED_FLAGS: ${CMAKE_REQUIRED_FLAGS}")
-            message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
-        endif()
+      if(NOT FFTW_FIND_QUIETLY)
+	message(STATUS "Looking for FFTW : test of ${FFTW_PREC_TESTFUNC}fftw_execute_ with fftw library fails")
+	message(STATUS "CMAKE_REQUIRED_LIBRARIES: ${CMAKE_REQUIRED_LIBRARIES}")
+	message(STATUS "CMAKE_REQUIRED_INCLUDES: ${CMAKE_REQUIRED_INCLUDES}")
+	message(STATUS "CMAKE_REQUIRED_FLAGS: ${CMAKE_REQUIRED_FLAGS}")
+	message(STATUS "Check in CMakeFiles/CMakeError.log to figure out why it fails")
+      endif()
     else()
-        set(FFTW_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
+      set(FFTW_LIBRARIES ${CMAKE_REQUIRED_LIBRARIES})
     endif()
     set(CMAKE_REQUIRED_INCLUDES)
     set(CMAKE_REQUIRED_FLAGS)
     set(CMAKE_REQUIRED_LIBRARIES)
-endif(FFTW_LIBRARIES)
+  endif(FFTW_LIBRARIES)
+
+endif( (NOT PKG_CONFIG_EXECUTABLE) OR (PKG_CONFIG_EXECUTABLE AND NOT FFTW_FOUND) OR (FFTW_GIVEN_BY_USER) )
 
 if (FFTW_LIBRARIES)
-    list(GET FFTW_LIBRARIES 0 first_lib)
-    get_filename_component(first_lib_path "${first_lib}" PATH)
-    if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)")
-        string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}")
-        set(FFTW_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of FFTW library" FORCE)
-    else()
-        set(FFTW_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of FFTW library" FORCE)
-    endif()
+  list(GET FFTW_LIBRARIES 0 first_lib)
+  get_filename_component(first_lib_path "${first_lib}" PATH)
+  if (${first_lib_path} MATCHES "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)")
+    string(REGEX REPLACE "(/lib(32|64)?$)|(/lib/intel64$|/lib/ia32$)" "" not_cached_dir "${first_lib_path}")
+    set(FFTW_DIR_FOUND "${not_cached_dir}" CACHE PATH "Installation directory of FFTW library" FORCE)
+  else()
+    set(FFTW_DIR_FOUND "${first_lib_path}" CACHE PATH "Installation directory of FFTW library" FORCE)
+  endif()
 endif()
 
 # check that FFTW has been found
 # -------------------------------
 include(FindPackageHandleStandardArgs)
 find_package_handle_standard_args(FFTW DEFAULT_MSG
-                                  FFTW_LIBRARIES
-                                  FFTW_INCLUDE_DIRS
-                                  FFTW_WORKS)
+  FFTW_LIBRARIES
+  FFTW_INCLUDE_DIRS
+  FFTW_WORKS)
diff --git a/Examples/CMakeLists.txt b/Examples/CMakeLists.txt
index 0add5c5da64a93e2a85e237f83771d8b81022fb0..c01af575183406afbea9f53920d318a925ccf664 100644
--- a/Examples/CMakeLists.txt
+++ b/Examples/CMakeLists.txt
@@ -17,8 +17,8 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Src 
-     ${CMAKE_SOURCE_DIR}/Src 
+     ${SCALFMM_BINARY_DIR}/Src 
+     ${SCALFMM_SOURCE_DIR}/Src 
      ${SCALFMM_INCLUDES}
      
 )
diff --git a/Src/CMakeLists.txt b/Src/CMakeLists.txt
index b71293cd8afe9c0ec1b69fcd4f36dcd4c195c5fc..531c3f2d90f239eaffd4cfd1ff05e96b21f43987 100644
--- a/Src/CMakeLists.txt
+++ b/Src/CMakeLists.txt
@@ -109,5 +109,5 @@ FOREACH(my_dir ${my_include_dirs})
 	INSTALL( FILES ${hpp_in_dir} DESTINATION include/${my_dir} )
 ENDFOREACH()
 
-INSTALL( FILES "${CMAKE_BINARY_DIR}/Src/ScalFmmConfig.h" DESTINATION include/Utils/${my_dir} )
+INSTALL( FILES "${SCALFMM_BINARY_DIR}/Src/ScalFmmConfig.h" DESTINATION include/Utils )
 
diff --git a/Src/Containers/FMpiBufferReader.hpp b/Src/Containers/FMpiBufferReader.hpp
index 95d9ac2e8387972b37b2c3b90d457f53dcae5896..98f351b2a1190855c87b060336aad67dc2a4d74d 100644
--- a/Src/Containers/FMpiBufferReader.hpp
+++ b/Src/Containers/FMpiBufferReader.hpp
@@ -21,34 +21,27 @@
 #include "FAbstractBuffer.hpp"
 #include "../Utils/FAssert.hpp"
 
-/** @author Cyrille Piacibello
- * This class provide the same features as FBufferWriter using MPI_Pack system
+/** @author Cyrille Piacibello, Berenger Bramas
+ * This class provides the same features as FBufferWriter
  *
  * Put some data
  * then insert back if needed
  * finally use data pointer as you like
  */
 class FMpiBufferReader : public FAbstractBufferReader {
-    MPI_Comm comm;            //< Communicator needed by MPI_Pack functions
     FSize arrayCapacity;        //< Allocated space
     std::unique_ptr<char[]> array;  //< Allocated Array
     FSize currentIndex;
 
 public :
     /*Constructor with a default arrayCapacity of 512 bytes */
-    explicit FMpiBufferReader(const MPI_Comm inComm = MPI_COMM_WORLD, const FSize inDefaultCapacity = 512):
-        comm(inComm),
+    explicit FMpiBufferReader(const FSize inDefaultCapacity = 512):
         arrayCapacity(inDefaultCapacity),
         array(new char[inDefaultCapacity]),
         currentIndex(0){
         FAssertLF(array, "Cannot allocate array");
     }
 
-    /** Change the comm (or to set it later) */
-    void setComm(const MPI_Comm inComm){
-        comm = inComm;
-    }
-
     /** To change the capacity (but reset the head to 0) */
     void cleanAndResize(const FSize newCapacity){
         if(newCapacity != arrayCapacity){
@@ -97,50 +90,34 @@ public :
     /** Get a value with memory cast */
     template <class ClassType>
     ClassType getValue(){
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(currentIndex < std::numeric_limits<int>::max());
-        int previousIndex = int(currentIndex);
+        FAssertLF(currentIndex + FSize(sizeof(ClassType)) <= arrayCapacity );
         ClassType value;
-        FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,&value,FMpi::GetTypeCount(value),FMpi::GetType(value),comm), __LINE__);
-        seek(FSize(sizeof(value) + currentIndex));
-        FAssertLF(previousIndex == currentIndex);
+        memcpy(&value, &array[currentIndex], sizeof(ClassType));
+        currentIndex += sizeof(ClassType);
         return value;
     }
 
     /** Get a value with memory cast at a specified index */
     template <class ClassType>
     ClassType getValue(const FSize ind){
-        ClassType value;
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(ind < std::numeric_limits<int>::max());
-        int previousIndex = int(ind);
-        FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,&value,FMpi::GetTypeCount(value),FMpi::GetType(value),comm), __LINE__);
-        seek(FSize(sizeof(value)+ind));
-        FAssertLF(previousIndex == currentIndex);
-        return value;
+        currentIndex = ind;
+        return getValue<ClassType>();
     }
 
     /** Fill a value with memory cast */
     template <class ClassType>
     void fillValue(ClassType* const inValue){
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(currentIndex < std::numeric_limits<int>::max());
-        int previousIndex = int(currentIndex);
-        FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,inValue,FMpi::GetTypeCount(*inValue),FMpi::GetType(*inValue),comm), __LINE__);
-        seek(FSize(sizeof(ClassType) + currentIndex));
-        FAssertLF(previousIndex == currentIndex);
+        FAssertLF(currentIndex + FSize(sizeof(ClassType)) <= arrayCapacity );
+        memcpy(inValue, &array[currentIndex], sizeof(ClassType));
+        currentIndex += sizeof(ClassType);
     }
 
     /** Fill one/many value(s) with memcpy */
     template <class ClassType>
     void fillArray(ClassType* const inArray, const FSize inSize){
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(currentIndex < std::numeric_limits<int>::max());
-        FAssertLF(inSize < std::numeric_limits<int>::max());
-        int previousIndex = int(currentIndex);
-        FMpi::Assert(MPI_Unpack(array.get(),int(arrayCapacity),&previousIndex,inArray,int(inSize)*FMpi::GetTypeCount(*inArray),FMpi::GetType(*inArray),comm), __LINE__);
-        seek(FSize(sizeof(ClassType) * inSize + currentIndex));
-        FAssertLF(previousIndex == currentIndex);
+        FAssertLF(currentIndex + FSize(sizeof(ClassType))*inSize <= arrayCapacity );
+        memcpy(inArray, &array[currentIndex], sizeof(ClassType)*inSize);
+        currentIndex += sizeof(ClassType)*inSize;
     }
 
     /** Same as fillValue */
diff --git a/Src/Containers/FMpiBufferWriter.hpp b/Src/Containers/FMpiBufferWriter.hpp
index c2a9942f6532e1821f4d9b62fbefc7d6840d8211..5050d1f3df4f2d1461afc886ac559a37050a1a6d 100644
--- a/Src/Containers/FMpiBufferWriter.hpp
+++ b/Src/Containers/FMpiBufferWriter.hpp
@@ -21,22 +21,21 @@
 #include "FAbstractBuffer.hpp"
 #include "../Utils/FAssert.hpp"
 
-/** @author Cyrille Piacibello
- * This class provide the same features as FBufferWriter using MPI_Pack system
+/** @author Cyrille Piacibello, Berenger Bramas
+ * This class provides the same features as FBufferWriter
  *
  * Put some data
  * then insert back if needed
  * finally use data pointer as you like
  */
 class FMpiBufferWriter : public FAbstractBufferWriter {
-    MPI_Comm mpiComm;         //< Communicator needed by MPI_Pack functions
     FSize arrayCapacity;              //< Allocated Space
     std::unique_ptr<char[]> array;  //< Allocated Array
     FSize currentIndex;               //< Currently filled space
 
     /** Test and exit if not enought space */
-    void expandIfNeeded(const size_t requestedSpace) {
-        if( arrayCapacity < FSize(currentIndex + requestedSpace) ){
+    void expandIfNeeded(const FSize requestedSpace) {
+        if( arrayCapacity < currentIndex + requestedSpace){
             arrayCapacity = FSize(double(currentIndex + requestedSpace + 1) * 1.5);
             char* arrayTmp = new char[arrayCapacity];
             memcpy(arrayTmp, array.get(), sizeof(char)*currentIndex);
@@ -46,19 +45,13 @@ class FMpiBufferWriter : public FAbstractBufferWriter {
 
 public:
     /** Constructor with a default arrayCapacity of 512 bytes */
-    explicit FMpiBufferWriter(const MPI_Comm inComm, const FSize inDefaultCapacity = 1024):
-        mpiComm(inComm),
+    explicit FMpiBufferWriter(const FSize inDefaultCapacity = 1024):
         arrayCapacity(inDefaultCapacity),
         array(new char[inDefaultCapacity]),
         currentIndex(0)
     {}
 
 
-    /** Change the comm (or to set it later) */
-    void setComm(const MPI_Comm inComm){
-        mpiComm = inComm;
-    }
-
     /** To change the capacity (but reset the head to 0 if size if lower) */
     void resize(const FSize newCapacity){
         if(newCapacity != arrayCapacity){
@@ -98,10 +91,8 @@ public:
     template <class ClassType>
     void write(const ClassType& object){
         expandIfNeeded(sizeof(ClassType));
-        FAssertLF(currentIndex < std::numeric_limits<int>::max());
-        int intCurrentIndex = int(currentIndex);
-        FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__);
-        currentIndex = intCurrentIndex;
+        memcpy(&array[currentIndex], &object, sizeof(ClassType));
+        currentIndex += sizeof(ClassType);
     }
 
     /**
@@ -110,20 +101,15 @@ public:
     template <class ClassType>
     void write(const ClassType&& object){
         expandIfNeeded(sizeof(ClassType));
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        int intCurrentIndex = int(currentIndex);
-        FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__);
-        currentIndex = intCurrentIndex;
+        memcpy(&array[currentIndex], &object, sizeof(ClassType));
+        currentIndex += sizeof(ClassType);
     }
 
     /** Write back, position + sizeof(object) has to be < size */
     template <class ClassType>
     void writeAt(const FSize position, const ClassType& object){
-        FAssertLF(FSize(position + sizeof(ClassType)) <= currentIndex);
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(position < std::numeric_limits<int>::max());
-        int noConstPosition = int(position);
-        FMpi::Assert(MPI_Pack(const_cast<ClassType*>(&object), FMpi::GetTypeCount(object), FMpi::GetType(object), array.get(), int(arrayCapacity), &noConstPosition, mpiComm), __LINE__);
+        FAssertLF(position+FSize(sizeof(ClassType)) <= currentIndex);
+        memcpy(&array[position], &object, sizeof(ClassType));
     }
 
     /** Write an array
@@ -132,11 +118,8 @@ public:
     template <class ClassType>
     void write(const ClassType* const objects, const FSize inSize){
         expandIfNeeded(sizeof(ClassType) * inSize);
-        FAssertLF(arrayCapacity < std::numeric_limits<int>::max());
-        FAssertLF(inSize < std::numeric_limits<int>::max());
-        int intCurrentIndex = int(currentIndex);
-        FMpi::Assert(MPI_Pack( const_cast<ClassType*>(objects), int(inSize)*FMpi::GetTypeCount(*objects), FMpi::GetType(*objects), array.get(), int(arrayCapacity), &intCurrentIndex, mpiComm), __LINE__);
-        currentIndex = intCurrentIndex;
+        memcpy(&array[currentIndex], objects, sizeof(ClassType)*inSize);
+        currentIndex += sizeof(ClassType)*inSize;
     }
 
     /** Equivalent to write */
diff --git a/Src/Core/FFmmAlgorithmThreadProc.hpp b/Src/Core/FFmmAlgorithmThreadProc.hpp
index 14382972d98d0ba9e2910bd0b57be8694f373508..05b4a37bda4415d13bfe136edda8056f1c2874fb 100644
--- a/Src/Core/FFmmAlgorithmThreadProc.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProc.hpp
@@ -363,6 +363,7 @@ protected:
         FLOG(computationCounter.tac());
         FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = "  << counterTime.tacAndElapsed() << " s)\n" );
         FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
+        FLOG( FLog::Controller.flush());
     }
 
     /////////////////////////////////////////////////////////////////////////////
@@ -400,7 +401,7 @@ protected:
         MPI_Status statusSize[8];
 
         FSize bufferSize;
-        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
+        FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7
         std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
         FSize recvBufferSize[7];
         CellClass recvBufferCells[7];
@@ -491,7 +492,7 @@ protected:
                             MPI_Isend(&bufferSize, 1, FMpi::GetType(bufferSize), currentProcIdToSendTo,
                                       FMpi::TagFmmM2MSize + idxLevel, comm.getComm(), &requestsSize[iterMpiRequestsSize++]);
                             FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
-                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, currentProcIdToSendTo,
+                            MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, currentProcIdToSendTo,
                                       FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                         }
                     }
@@ -532,7 +533,7 @@ protected:
                             if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
                                 recvBuffer[nbProcThatSendToMe].cleanAndResize(recvBufferSize[nbProcThatSendToMe]);
                                 FAssertLF(recvBufferSize[nbProcThatSendToMe] < std::numeric_limits<int>::max());
-                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_PACKED,
+                                MPI_Irecv(recvBuffer[nbProcThatSendToMe].data(), int(recvBufferSize[nbProcThatSendToMe]), MPI_BYTE,
                                         idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
                                 nbProcThatSendToMe += 1;
                                 FAssertLF(nbProcThatSendToMe <= 7);
@@ -556,7 +557,7 @@ protected:
 
                         // Retreive data and merge my child and the child from others
                         for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
-                            int packageFlags = int(recvBuffer[idxProc].getValue<char>());
+                            unsigned packageFlags = unsigned(recvBuffer[idxProc].getValue<unsigned char>());
 
                             int position = 0;
                             int positionToInsert = 0;
@@ -602,6 +603,7 @@ protected:
         FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
         FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
         FLOG( FLog::Controller << "\t\t Parallel : " << parallelCounter.cumulated() << " s\n" );
+        FLOG( FLog::Controller.flush());
     }
 
     /////////////////////////////////////////////////////////////////////////////
@@ -754,15 +756,14 @@ protected:
                 FLOG(sendCounter.tic());
                 // Then they can send and receive (because they know what they will receive)
                 // To send in asynchrone way
-                MPI_Request*const requests = new MPI_Request[2 * nbProcess * OctreeHeight];
-                MPI_Status*const status = new MPI_Status[2 * nbProcess * OctreeHeight];
-                int iterRequest = 0;
+                std::vector<MPI_Request> requests;
+                requests.reserve(2 * nbProcess * OctreeHeight);
 
                 for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
                     for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                         const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                         if(toSendAtProcAtLevel != 0){
-                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));
+                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel);
 
                             sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));
 
@@ -776,20 +777,18 @@ protected:
 
                             FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() == toSendAtProcAtLevel);
 
-                            FAssertLF(sendBuffer[idxLevel * nbProcess + idxProc]->getSize() < std::numeric_limits<int>::max());
-                            FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
-                                             int(sendBuffer[idxLevel * nbProcess + idxProc]->getSize()),MPI_PACKED, idxProc,
-                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
+                            FMpi::ISendSplit(sendBuffer[idxLevel * nbProcess + idxProc]->data(),
+                                    sendBuffer[idxLevel * nbProcess + idxProc]->getSize(), idxProc,
+                                    FMpi::TagLast + idxLevel*100, comm, &requests);
                         }
 
                         const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                         if(toReceiveFromProcAtLevel){
-                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));
+                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel);
 
-                            FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
-                            FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
-                                             int(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity()), MPI_PACKED,idxProc,
-                                    FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
+                            FMpi::IRecvSplit(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
+                                    recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), idxProc,
+                                    FMpi::TagLast + idxLevel*100, comm, &requests);
                         }
                     }
                 }
@@ -799,10 +798,7 @@ protected:
                 //////////////////////////////////////////////////////////////////
 
                 // Wait to receive every things (and send every things)
-                FMpi::MpiAssert(MPI_Waitall(iterRequest, requests, status), __LINE__);
-
-                delete[] requests;
-                delete[] status;
+                FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE), __LINE__);
 
                 FLOG(sendCounter.tac());
             }//End of Master region
@@ -1009,6 +1005,7 @@ protected:
         FLOG( FLog::Controller << "\t\t Receive : " << receiveCounter.cumulated() << " s\n" );
         FLOG( FLog::Controller << "\t\t Gather : " << gatherCounter.cumulated() << " s\n" );
         FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
+        FLOG( FLog::Controller.flush());
 
     }
 
@@ -1039,8 +1036,8 @@ protected:
 
         const int heightMinusOne = FAbstractAlgorithm::lowerWorkingLevel - 1;
 
-        FMpiBufferWriter sendBuffer(comm.getComm());
-        FMpiBufferReader recvBuffer(comm.getComm());
+        FMpiBufferWriter sendBuffer;
+        FMpiBufferReader recvBuffer;
 
         int righestProcToSendTo   = nbProcess - 1;
 
@@ -1116,7 +1113,7 @@ protected:
                                 FMpi::MpiAssert( MPI_Isend(&sendBufferSize, 1, FMpi::GetType(sendBufferSize), idxProcSend,
                                                            FMpi::TagFmmL2LSize + idxLevel, comm.getComm(), &requestsSize[iterRequestsSize++]), __LINE__);
                                 FAssertLF(sendBuffer.getSize() < std::numeric_limits<int>::max());
-                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_PACKED, idxProcSend,
+                                FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), int(sendBuffer.getSize()), MPI_BYTE, idxProcSend,
                                                            FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
                                 // Inc and check the counter
                                 nbMessageSent += 1;
@@ -1139,7 +1136,7 @@ protected:
                     if(hasToReceive){
                         recvBuffer.cleanAndResize(recvBufferSize);
                         FAssertLF(recvBuffer.getCapacity() < std::numeric_limits<int>::max());
-                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_PACKED, idxProcToReceive,
+                        FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), int(recvBuffer.getCapacity()), MPI_BYTE, idxProcToReceive,
                                                     FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
                     }
 
@@ -1184,6 +1181,7 @@ protected:
         FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
         FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
         FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
+        FLOG( FLog::Controller.flush());
     }
 
 
@@ -1213,25 +1211,6 @@ protected:
         ///////////////////////////////////////////////////
         FLOG(prepareCounter.tic());
 
-        // To send in asynchrone way
-        MPI_Request requests[2 * nbProcess];
-        MPI_Status status[2 * nbProcess];
-        int iterRequest = 0;
-        int nbMessagesToRecv = 0;
-
-        FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
-        memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);
-
-        FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
-        memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);
-
-        /* This a nbProcess x nbProcess matrix of integer
-     * let U and V be id of processes :
-     * globalReceiveMap[U*nbProcess + V] == size of information needed by V and own by U
-     */
-        FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
-        memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);
-
         FBoolArray leafsNeedOther(this->numberOfLeafs);
         int countNeedOther = 0;
 
@@ -1320,28 +1299,43 @@ protected:
 
 #pragma omp master // nowait
             if(p2pEnabled){
+                /* This is an nbProcess x nbProcess matrix of integers.
+                 * Let U and V be process ids:
+                 * globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
+                 */
+                FSize*const globalReceiveMap = new FSize[nbProcess * nbProcess];
+                memset(globalReceiveMap, 0, sizeof(FSize) * nbProcess * nbProcess);
+
                 //Share to all processus globalReceiveMap
                 FLOG(gatherCounter.tic());
                 FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, FMpi::GetType(*partsToSend),
                                                 globalReceiveMap, nbProcess, FMpi::GetType(*partsToSend), comm.getComm()),  __LINE__ );
                 FLOG(gatherCounter.tac());
 
+                FMpiBufferReader**const recvBuffer = new FMpiBufferReader*[nbProcess];
+                memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess);
+
+                FMpiBufferWriter**const sendBuffer = new FMpiBufferWriter*[nbProcess];
+                memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess);
+
+                // To send asynchronously
+                std::vector<MPI_Request> requests;
+                requests.reserve(2 * nbProcess);
                 //Prepare receive
                 for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                     if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me.
                         //allocate buffer of right size
-                        recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]);
-                        FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max());
-                        FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED,
-                                                   idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
+                        recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]);
+
+                        FMpi::IRecvSplit(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getCapacity(),
+                                         idxProc, FMpi::TagFmmP2P, comm, &requests);
                     }
                 }
 
-                nbMessagesToRecv = iterRequest;
                 // Prepare send
                 for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                     if(toSend[idxProc].getSize() != 0){
-                        sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]);
+                        sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]);
                         // << is equivalent to write().
                         (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
                         for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
@@ -1350,9 +1344,9 @@ protected:
                         }
 
                         FAssertLF(sendBuffer[idxProc]->getSize() == globalReceiveMap[idProcess*nbProcess+idxProc]);
-                        FAssertLF(sendBuffer[idxProc]->getSize() < std::numeric_limits<int>::max());
-                        FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), int(sendBuffer[idxProc]->getSize()) , MPI_PACKED ,
-                                                    idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
+
+                        FMpi::ISendSplit(sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize(),
+                                         idxProc, FMpi::TagFmmP2P, comm, &requests);
 
                     }
                 }
@@ -1364,23 +1358,34 @@ protected:
                 // Waitsend receive
                 //////////////////////////////////////////////////////////
 
+                std::unique_ptr<MPI_Status[]> status(new MPI_Status[requests.size()]);
                 // Wait data
                 FLOG(waitCounter.tic());
-                MPI_Waitall(iterRequest, requests, status);
+                MPI_Waitall(int(requests.size()), requests.data(), status.get());
                 FLOG(waitCounter.tac());
 
-                for(int idxRcv = 0 ; idxRcv < nbMessagesToRecv ; ++idxRcv){
-                    const int idxProc = status[idxRcv].MPI_SOURCE;
-                    FSize nbLeaves;
-                    (*recvBuffer[idxProc]) >> nbLeaves;
-                    for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
-                        MortonIndex leafIndex;
-                        (*recvBuffer[idxProc]) >> leafIndex;
-                        otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore((*recvBuffer[idxProc]));
+                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                    if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me.
+                        FAssertLF(recvBuffer[idxProc]);
+                        FMpiBufferReader& currentBuffer = (*recvBuffer[idxProc]);
+                        FSize nbLeaves;
+                        currentBuffer >> nbLeaves;
+                        for(FSize idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
+                            MortonIndex leafIndex;
+                            currentBuffer >> leafIndex;
+                            otherP2Ptree.createLeaf(leafIndex)->getSrc()->restore(currentBuffer);
+                        }
+                        // Release memory early
+                        delete recvBuffer[idxProc];
+                        recvBuffer[idxProc] = nullptr;
                     }
+                }                
+
+                for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
+                    delete sendBuffer[idxProc];
                     delete recvBuffer[idxProc];
-                    recvBuffer[idxProc] = nullptr;
                 }
+                delete[] globalReceiveMap;
             }
 
             ///////////////////////////////////////////////////
@@ -1530,11 +1535,6 @@ protected:
             }
         }
 
-        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-            delete sendBuffer[idxProc];
-            delete recvBuffer[idxProc];
-        }
-        delete[] globalReceiveMap;
         delete[] leafsDataArray;
 
         FLOG(computation2Counter.tac());
@@ -1546,6 +1546,7 @@ protected:
         FLOG( FLog::Controller << "\t\t Prepare P2P : " << prepareCounter.elapsed() << " s\n" );
         FLOG( FLog::Controller << "\t\t Gather P2P : " << gatherCounter.elapsed() << " s\n" );
         FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.elapsed() << " s\n" );
+        FLOG( FLog::Controller.flush());
 
     }
 };
diff --git a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
index 947086936ce5902577558b87a8e24b9c2928458b..b39eed9d04cf539e61581a9351397609644d5273 100644
--- a/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
+++ b/Src/Core/FFmmAlgorithmThreadProcPeriodic.hpp
@@ -404,7 +404,7 @@ protected:
         MPI_Status statusSize[8];
 
         FSize bufferSize;
-        FMpiBufferWriter sendBuffer(comm.getComm(), 1);// Max = 1 + sizeof(cell)*7
+        FMpiBufferWriter sendBuffer(1);// Max = 1 + sizeof(cell)*7
         std::unique_ptr<FMpiBufferReader[]> recvBuffer(new FMpiBufferReader[7]);
         FSize recvBufferSize[7];
         CellClass recvBufferCells[7];
@@ -856,7 +856,7 @@ protected:
                     for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                         const long long int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
                         if(toSendAtProcAtLevel != 0){
-                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),int(toSendAtProcAtLevel));
+                            sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(toSendAtProcAtLevel);
 
                             sendBuffer[idxLevel * nbProcess + idxProc]->write(int(toSend[idxLevel * nbProcess + idxProc].getSize()));
 
@@ -878,7 +878,7 @@ protected:
 
                         const long long int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
                         if(toReceiveFromProcAtLevel){
-                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),int(toReceiveFromProcAtLevel));
+                            recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(toReceiveFromProcAtLevel);
 
                             FAssertLF(recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity() < std::numeric_limits<int>::max());
                             FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
@@ -1126,7 +1126,7 @@ protected:
         MPI_Status*const statusSize = new MPI_Status[8];
 
         FMpiBufferWriter sendBuffer(comm.getComm());
-        FMpiBufferReader recvBuffer(comm.getComm());
+        FMpiBufferReader recvBuffer;
 
         int righestProcToSendTo   = nbProcess - 1;
 
@@ -1441,7 +1441,7 @@ protected:
                 for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                     if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me.
                         //allocate buffer of right size
-                        recvBuffer[idxProc] = new FMpiBufferReader(comm.getComm(),globalReceiveMap[idxProc * nbProcess + idProcess]);
+                        recvBuffer[idxProc] = new FMpiBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]);
                         FAssertLF(recvBuffer[idxProc]->getCapacity() < std::numeric_limits<int>::max());
                         FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), int(recvBuffer[idxProc]->getCapacity()), MPI_PACKED,
                                                    idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
@@ -1452,7 +1452,7 @@ protected:
                 // Prepare send
                 for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                     if(toSend[idxProc].getSize() != 0){
-                        sendBuffer[idxProc] = new FMpiBufferWriter(comm.getComm(),globalReceiveMap[idProcess*nbProcess+idxProc]);
+                        sendBuffer[idxProc] = new FMpiBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]);
                         // << is equivalent to write().
                         (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
                         for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
diff --git a/Src/Files/FMpiTreeBuilder.hpp b/Src/Files/FMpiTreeBuilder.hpp
index 89a57482e2eb02e01dc5f50b16e8f19258a23785..87b5ecec6def1dee1acf607d7f31a18cee3d4cd0 100644
--- a/Src/Files/FMpiTreeBuilder.hpp
+++ b/Src/Files/FMpiTreeBuilder.hpp
@@ -20,6 +20,7 @@
 #include "../Utils/FQuickSortMpi.hpp"
 #include "../Utils/FBitonicSort.hpp"
 #include "../Utils/FTic.hpp"
+#include "../Utils/FEnv.hpp"
 
 #include "../Utils/FMemUtils.hpp"
 
@@ -41,6 +42,8 @@
 template<class FReal, class ParticleClass>
 class FMpiTreeBuilder{
 private:
+    static const bool VerboseLog;
+
     /** To keep the leaves information after the sort */
     struct LeafInfo {
         MortonIndex mindex;
@@ -188,13 +191,15 @@ public:
             if( (*workingSize) != 0 ){
                 borderLeavesState[0] = leavesInfo[0];
                 borderLeavesState[1] = leavesInfo[leavesInfo.getSize()-1];
+                FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] First " << borderLeavesState[0].mindex << "\n"; FLog::Controller.flush(); );
+                FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Last " << borderLeavesState[1].mindex << "\n"; FLog::Controller.flush(); );
             }
 
             std::unique_ptr<LeafInfo[]> allProcFirstLeafStates(new LeafInfo[nbProcs*2]);
             FMpi::MpiAssert(MPI_Allgather(&borderLeavesState, sizeof(LeafInfo)*2, MPI_BYTE,
                                           allProcFirstLeafStates.get(), sizeof(LeafInfo)*2, MPI_BYTE, communicator.getComm()),__LINE__);
 
-            FVector<MPI_Request> requests;
+            std::vector<MPI_Request> requests;
 
             // Find what to send/recv from who
             bool hasSentFirstLeaf = false;
@@ -209,10 +214,9 @@ public:
                 // We found someone
                 if(idProcToSendTo != myRank && allProcFirstLeafStates[(idProcToSendTo)*2 + 1].mindex == borderLeavesState[0].mindex){
                     // Post and send message for the first leaf
-                    requests.push((MPI_Request)0);
-                    FAssertLF(borderLeavesState[0].nbParts < std::numeric_limits<int>::max());
-                    FMpi::MpiAssert(MPI_Isend(&workingArray[0], int(borderLeavesState[0].nbParts), MPI_BYTE, idProcToSendTo,
-                            FMpi::TagExchangeIndexs, communicator.getComm(), &requests[0]),__LINE__);
+                    FMpi::ISendSplit(&workingArray[0], borderLeavesState[0].nbParts, idProcToSendTo,
+                            FMpi::TagExchangeIndexs, communicator, &requests);
+                    FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] send " << borderLeavesState[0].nbParts << " to " << idProcToSendTo << "\n"; FLog::Controller.flush(); );
                     hasSentFirstLeaf = true;
                 }
             }
@@ -239,10 +243,9 @@ public:
                     for(int postRecvIdx = (myRank+1); postRecvIdx <= idProcToRecvFrom ; ++postRecvIdx){
                         // If there are some on this proc
                         if(allProcFirstLeafStates[(postRecvIdx)*2].mindex != noDataFlag){
-                            requests.push((MPI_Request)0);
-                            FAssertLF(allProcFirstLeafStates[(postRecvIdx)*2].nbParts < std::numeric_limits<int>::max());
-                            FMpi::MpiAssert(MPI_Irecv(&receivedParticles[postPositionRecv], int(allProcFirstLeafStates[(postRecvIdx)*2].nbParts), MPI_BYTE, postRecvIdx,
-                                            FMpi::TagExchangeIndexs, communicator.getComm(), &requests[0]),__LINE__);
+                            FMpi::IRecvSplit(&receivedParticles[postPositionRecv], allProcFirstLeafStates[(postRecvIdx)*2].nbParts, postRecvIdx,
+                                    FMpi::TagExchangeIndexs, communicator, &requests);
+                            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] recv " << allProcFirstLeafStates[(postRecvIdx)*2].nbParts << " from " << postRecvIdx << "\n"; FLog::Controller.flush(); );
                             // Inc the write position
                             postPositionRecv += allProcFirstLeafStates[(postRecvIdx)*2].nbParts;
                         }
@@ -252,7 +255,7 @@ public:
             }
 
             // Finalize communication
-            FMpi::MpiAssert(MPI_Waitall(int(requests.getSize()), requests.data(), MPI_STATUSES_IGNORE),__LINE__);
+            FMpi::MpiAssert(MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE),__LINE__);
 
             // IF we sent we need to remove the first leaf
             if(hasSentFirstLeaf){
@@ -282,6 +285,7 @@ public:
                 delete[] workingArray;
                 workingArray   = particlesWithExtension;
                 (*workingSize) = finalParticlesNumber;
+                leavesInfo[leavesInfo.getSize()-1].nbParts += receivedParticles.size();
             }
         }
         {//Filling the Array with leaves and parts //// COULD BE MOVED IN AN OTHER FUCTION
@@ -319,8 +323,6 @@ public:
                                          const FSize leavesOffsetInParticles[], const ParticleClass particlesArrayInLeafOrder[],
                                          const FSize currentNbLeaves,
                                          const FSize currentNbParts, FAbstractBalanceAlgorithm * balancer){
-        const FSize MAX_BYTE_PER_MPI_MESS = 2000000000;
-        const FSize MAX_PARTICLES_PER_MPI_MESS = FMath::Max(FSize(1), FSize(MAX_BYTE_PER_MPI_MESS/sizeof(ParticleClass)));
         const int myRank = communicator.processId();
         const int nbProcs = communicator.processCount();
 
@@ -336,7 +338,10 @@ public:
             FMpi::MpiAssert(MPI_Allgather(const_cast<FSize*>(&currentNbLeaves), 1, MPI_LONG_LONG_INT, numberOfLeavesPerProc.get(),
                                           1, MPI_LONG_LONG_INT, communicator.getComm()), __LINE__);
 
-            //prefix sum
+
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Exchange number of leaves\n"; FLog::Controller.flush(); );
+
+            // prefix sum
             std::unique_ptr<FSize[]> diffNumberOfLeavesPerProc(new FSize[nbProcs+1]);
             diffNumberOfLeavesPerProc[0] = 0;
             for(int idxProc = 0 ; idxProc < nbProcs ; ++idxProc ){
@@ -350,39 +355,53 @@ public:
             for(int idxProc = 0 ; idxProc < nbProcs ; ++idxProc){
                 allObjectives[idxProc].first  = balancer->getLeft(totalNumberOfLeavesInSimulation,nbProcs,idxProc);
                 allObjectives[idxProc].second = balancer->getRight(totalNumberOfLeavesInSimulation,nbProcs,idxProc);
+                if(idxProc != 0) FAssertLF(allObjectives[idxProc].first == allObjectives[idxProc-1].second);
             }
 
             // Ask for the pack to send
             std::pair<size_t, size_t> myCurrentInter = {diffNumberOfLeavesPerProc[myRank], diffNumberOfLeavesPerProc[myRank+1]};
             const std::vector<FEqualize::Package> packsToSend = FEqualize::GetPackToSend(myCurrentInter, allObjectives);
-            std::unique_ptr<FSize[]> nbPartsPerPackToSend(new FSize[packsToSend.size()]);
+
+            FAssertLF((currentNbLeaves == 0 && packsToSend.size() == 0) ||
+                      (packsToSend.size() && FSize(packsToSend[packsToSend.size()-1].elementTo) == currentNbLeaves));
+
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Get my interval (" << packsToSend.size() << ")\n"; FLog::Controller.flush(); );
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Send data\n"; FLog::Controller.flush(); );
+
             // Store the requests
-            std::vector<MPI_Request> requestsParts;
             std::vector<MPI_Request> requestsNbParts;
+            requestsNbParts.reserve(packsToSend.size());
+
             // Send every thing except for me or if size == 0
+            FSize totalSend = 0;
+            FSize sendToMe = 0;
             for(unsigned int idxPack = 0; idxPack< packsToSend.size() ; ++idxPack){
                 const FEqualize::Package& pack = packsToSend[idxPack];
+
+                if(idxPack != 0) FAssertLF(packsToSend[idxPack].elementFrom == packsToSend[idxPack-1].elementTo);
+                const long long int nbPartsPerPackToSend = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom];
+                totalSend += nbPartsPerPackToSend;
+
                 if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){
                     // If not to me and if there is something to send
-                    nbPartsPerPackToSend[idxPack] = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom];
+                    FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] pre-send to " << pack.idProc << " nb " << nbPartsPerPackToSend
+                         << " from " << pack.elementFrom << " to " << pack.elementTo << " \n"; FLog::Controller.flush(); );
                     // Send the size of the data
                     requestsNbParts.emplace_back();
-                    FMpi::MpiAssert(MPI_Isend(&nbPartsPerPackToSend[idxPack],1,MPI_LONG_LONG_INT,pack.idProc,
-                                              FMpi::TagExchangeIndexs+1, communicator.getComm(), &requestsNbParts.back()),__LINE__);
-                    // Send the data
-                    for(FSize idxMess = 0 ; idxMess < nbPartsPerPackToSend[idxPack]; idxMess += MAX_PARTICLES_PER_MPI_MESS){
-                        const int nbElementsInMessage = int(FMath::Min(nbPartsPerPackToSend[idxPack]-idxMess, MAX_PARTICLES_PER_MPI_MESS));
-                        requestsParts.emplace_back();
-                        FMpi::MpiAssert(MPI_Isend(const_cast<ParticleClass*>(&particlesArrayInLeafOrder[leavesOffsetInParticles[pack.elementFrom]+idxMess]),
-                                int(sizeof(ParticleClass)*nbElementsInMessage),
-                                MPI_BYTE, pack.idProc, int(FMpi::TagExchangeIndexs + 2 + idxMess), communicator.getComm(), &requestsParts.back()), __LINE__);
-                    }
+                    FMpi::MpiAssert(MPI_Isend(&nbPartsPerPackToSend,1,MPI_LONG_LONG_INT,pack.idProc,
+                                              FMpi::TagExchangeIndexs, communicator.getComm(), &requestsNbParts.back()),__LINE__);
+
                 }
                 else {
-                    // Nothing to send
-                    nbPartsPerPackToSend[idxPack] = 0;
+                    sendToMe = nbPartsPerPackToSend;
+                    FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] skip " << idxPack
+                         << " from " << pack.elementFrom << " to " << pack.elementTo <<  " \n"; FLog::Controller.flush(); );
                 }
             }
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Send done \n"; FLog::Controller.flush(); );
+            // Ensure everything has been processed
+            FAssertLF(totalSend == currentNbParts);
+
             // Compute the current intervals
             std::vector< std::pair<size_t,size_t> > allCurrentIntervals;
             allCurrentIntervals.resize(nbProcs);
@@ -391,18 +410,24 @@ public:
                 allCurrentIntervals[idxProc].second = diffNumberOfLeavesPerProc[idxProc+1];
             }
             // Ask the packs to receive to fill my objective
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Get my receive interval \n"; FLog::Controller.flush(); );
             std::pair<size_t, size_t> myObjective = allObjectives[myRank];
-            const std::vector<FEqualize::Package> packsToRecv = FEqualize::GetPackToRecv(myObjective, allCurrentIntervals);
+            const std::vector<FEqualize::Package> packsToRecv = FEqualize::GetPackToRecv(myObjective, allCurrentIntervals);            
 
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] recv nb particles \n"; FLog::Controller.flush(); );
             // Count the number of parts to receive
             std::unique_ptr<FSize[]> nbPartsPerPackToRecv(new FSize[packsToRecv.size()]);
             for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){
                 const FEqualize::Package& pack = packsToRecv[idxPack];
+
+                if(idxPack != 0) FAssertLF(packsToRecv[idxPack].elementFrom == packsToRecv[idxPack-1].elementTo);
+
                 if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){
+                    FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] pre-recv from " << pack.idProc << " \n"; FLog::Controller.flush(); );
                     // We need to know how much particles to receive
                     requestsNbParts.emplace_back();
                     FMpi::MpiAssert(MPI_Irecv(&nbPartsPerPackToRecv[idxPack], 1, MPI_LONG_LONG_INT, pack.idProc,
-                                              FMpi::TagExchangeIndexs+1, communicator.getComm(), &requestsNbParts.back()), __LINE__);
+                                              FMpi::TagExchangeIndexs, communicator.getComm(), &requestsNbParts.back()), __LINE__);
                 }
                 else{
                     if(pack.idProc == myRank){
@@ -410,6 +435,9 @@ public:
                         const FSize sourcePosition = FMath::Max(myObjective.first, myCurrentInter.first) - myCurrentInter.first;
                         const FSize nbLeavesToCopy = pack.elementTo-pack.elementFrom;
                         nbPartsPerPackToRecv[idxPack] = leavesOffsetInParticles[sourcePosition+nbLeavesToCopy] - leavesOffsetInParticles[sourcePosition];
+                        FAssertLF(nbPartsPerPackToRecv[idxPack] == sendToMe);
+                        FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] skip recv " <<
+                             idxPack << " nb " << nbPartsPerPackToRecv[idxPack] << " \n"; FLog::Controller.flush(); );
                     }
                     else{
                         // Nothing to receive from this so avoid communication
@@ -418,33 +446,74 @@ public:
                 }
             }
 
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Wait \n"; FLog::Controller.flush(); );
+
             FMpi::MpiAssert(MPI_Waitall(int(requestsNbParts.size()), requestsNbParts.data(), MPI_STATUSES_IGNORE), __LINE__);
 
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Wait Done \n"; FLog::Controller.flush(); );
+
+            std::vector<MPI_Request> requestsParts;
+
+            for(unsigned int idxPack = 0; idxPack< packsToSend.size() ; ++idxPack){
+                const FEqualize::Package& pack = packsToSend[idxPack];
+                if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){
+                    const long long int nbPartsPerPackToSend = leavesOffsetInParticles[pack.elementTo]-leavesOffsetInParticles[pack.elementFrom];
+
+                    FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] send to "
+                          << pack.idProc << " nb " << nbPartsPerPackToSend << " \n"; FLog::Controller.flush(); );
+
+                    FMpi::ISendSplit(&particlesArrayInLeafOrder[leavesOffsetInParticles[pack.elementFrom]],
+                                    nbPartsPerPackToSend,
+                                    pack.idProc,
+                                    FMpi::TagExchangeIndexs + 1,
+                                    communicator,
+                                    &requestsParts);
+                }
+            }
+
+
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] barrier after all send \n"; FLog::Controller.flush(); );
+
+            ////////////////////////////////////////////////////////////////
             // Count the number of leaf to receive
+            ////////////////////////////////////////////////////////////////
             FSize totalPartsToReceive = 0;
             for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){
                 totalPartsToReceive += nbPartsPerPackToRecv[idxPack];
             }
 
-            std::vector<ParticleClass> particlesRecvBuffer;
+            std::unique_ptr<ParticleClass[]> particlesRecvBuffer(new ParticleClass[totalPartsToReceive]);
+
+            ////////////////////////////////////////////////////////////////
             // Post all the receive and copy mine
+            // it is based on the nbPartsPerPackToRecv array
+            ////////////////////////////////////////////////////////////////
             if(totalPartsToReceive){
-                particlesRecvBuffer.resize(totalPartsToReceive);
                 FSize offsetToRecv = 0;
                 for(unsigned int idxPack = 0; idxPack < packsToRecv.size(); ++idxPack){
                     const FEqualize::Package& pack = packsToRecv[idxPack];
+                    // If it is not from me
                     if(pack.idProc != myRank && 0 < (pack.elementTo-pack.elementFrom)){
-                        for(FSize idxMess = 0 ; idxMess < nbPartsPerPackToRecv[idxPack]; idxMess += MAX_PARTICLES_PER_MPI_MESS){
-                            const int nbElementsInMessage = int(FMath::Min(nbPartsPerPackToRecv[idxPack]-idxMess, MAX_PARTICLES_PER_MPI_MESS));
-                            requestsParts.emplace_back();
-                            FMpi::MpiAssert( MPI_Irecv(&particlesRecvBuffer[offsetToRecv+idxMess],
-                                             int(sizeof(ParticleClass)*nbElementsInMessage), MPI_BYTE, pack.idProc,
-                                             int(FMpi::TagExchangeIndexs + 2 + idxMess), communicator.getComm(), &requestsParts.back()), __LINE__);
-                        }
+                        FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] recv from "
+                              << pack.idProc << " nb " << nbPartsPerPackToRecv[idxPack] << " from " << pack.elementFrom << "\n"; FLog::Controller.flush(); );
+
+                        // We store from offset, and use nbPartsPerPackToRecv as the number
+                        FMpi::IRecvSplit(&particlesRecvBuffer[offsetToRecv],
+                                        nbPartsPerPackToRecv[idxPack],
+                                        pack.idProc,
+                                        FMpi::TagExchangeIndexs + 1,
+                                        communicator,
+                                        &requestsParts);
+
                     }
+                    // it is from me, just copy
                     else if(pack.idProc == myRank){
+                        FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] copy "
+                             << idxPack << " nb " << nbPartsPerPackToRecv[idxPack] << " from " << pack.elementFrom << " \n"; FLog::Controller.flush(); );
                         // Copy my particles
                         const FSize sourcePosition = FMath::Max(myObjective.first, myCurrentInter.first) - myCurrentInter.first;
+                        // We store from offset, and use nbPartsPerPackToRecv as the number
+                        // The reading position is the offset of the first leaf we own
                         memcpy(&particlesRecvBuffer[offsetToRecv], &particlesArrayInLeafOrder[leavesOffsetInParticles[sourcePosition]],
                                 nbPartsPerPackToRecv[idxPack]*sizeof(ParticleClass));
                     }
@@ -452,11 +521,15 @@ public:
                 }
             }
 
-            // Finalize communication
-            FMpi::MpiAssert(MPI_Waitall(int(requestsParts.size()), requestsParts.data(), MPI_STATUSES_IGNORE), __LINE__);
+
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] pre Wait \n"; FLog::Controller.flush(); );
+
+            FMpi::Assert( MPI_Waitall(int(requestsParts.size()), requestsParts.data(), MPI_STATUSES_IGNORE),  __LINE__ );
+
+            FLOG(if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << communicator.processId() << "] Wait Done \n"; FLog::Controller.flush(); );
 
             // Insert in the particle saver
-            for(FSize idPartsToStore = 0 ; idPartsToStore < int(particlesRecvBuffer.size()) ; ++idPartsToStore){
+            for(FSize idPartsToStore = 0 ; idPartsToStore < totalPartsToReceive ; ++idPartsToStore){
                 particlesSaver->push(particlesRecvBuffer[idPartsToStore]);
             }
         }
@@ -471,7 +544,7 @@ public:
                                            const FPoint<FReal>& boxCenter, const FReal boxWidth, const int treeHeight,
                                            ContainerClass* particleSaver, FAbstractBalanceAlgorithm* balancer, const SortingType sortingType = QuickSort){
 
-        FLOG( FLog::Controller << "Particles Distribution: "  << "Enter DistributeArrayToContainer\n" ; FLog::Controller.flush(); );
+        FLOG( FLog::Controller << "["  << communicator.processId() << "] Particles Distribution: "  << "Enter DistributeArrayToContainer\n" ; FLog::Controller.flush(); );
         FLOG( FTic timer );
 
         IndexedParticle* sortedParticlesArray = nullptr;
@@ -479,9 +552,13 @@ public:
         // From ParticleClass get array of IndexedParticle sorted
         GetSortedParticlesFromArray(communicator, originalParticlesArray, originalNbParticles, sortingType, boxCenter, boxWidth, treeHeight,
                                     &sortedParticlesArray, &nbParticlesInArray);
-        FLOG( FLog::Controller << "Particles Distribution: "  << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
+        FLOG( FLog::Controller << "["  << communicator.processId() << "] Particles Distribution: "  << "\t GetSortedParticlesFromArray is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
         FLOG( timer.tic() );
 
+//        for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){
+//            particleSaver->push(sortedParticlesArray[idx].particle);
+//        }
+
         ParticleClass* particlesArrayInLeafOrder = nullptr;
         FSize * leavesOffsetInParticles = nullptr;
         FSize nbLeaves = 0;
@@ -489,7 +566,11 @@ public:
         MergeSplitedLeaves(communicator, sortedParticlesArray, &nbParticlesInArray, &leavesOffsetInParticles, &particlesArrayInLeafOrder, &nbLeaves);
         delete[] sortedParticlesArray;
 
-        FLOG( FLog::Controller << "Particles Distribution: "  << "\t MergeSplitedLeaves is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
+//        for(int idx = 0 ; idx < nbParticlesInArray ; ++idx){
+//            particleSaver->push(particlesArrayInLeafOrder[idx]);
+//        }
+
+        FLOG( FLog::Controller << "["  << communicator.processId() << "] Particles Distribution: "  << "\t MergeSplitedLeaves is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
         FLOG( timer.tic() );
 
         // Equalize and balance
@@ -498,9 +579,9 @@ public:
         delete[] particlesArrayInLeafOrder;
         delete[] leavesOffsetInParticles;
 
-        FLOG( FLog::Controller << "Particles Distribution: "  << "\t EqualizeAndFillContainer is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
+        FLOG( FLog::Controller << "["  << communicator.processId() << "] Particles Distribution: "  << "\t EqualizeAndFillContainer is over (" << timer.tacAndElapsed() << "s)\n"; FLog::Controller.flush(); );
 
-        FLOG( FLog::Controller << "Particles Distribution: "  << "\t DistributeArrayToContainer is over (" << timer.cumulated() << "s)\n"; FLog::Controller.flush(); );
+        FLOG( FLog::Controller << "["  << communicator.processId() << "] Particles Distribution: "  << "\t DistributeArrayToContainer is over (" << timer.cumulated() << "s)\n"; FLog::Controller.flush(); );
 
 #ifdef SCALFMM_USE_LOG
         /** To produce stats after the Equalize phase  */
@@ -539,4 +620,10 @@ public:
 
 };
 
+
+#ifdef SCALFMM_USE_LOG
+template<class FReal, class ParticleClass>
+const bool FMpiTreeBuilder<FReal,ParticleClass>::VerboseLog = FEnv::GetBool("SCALFMM_DEBUG_LOG", false);
+#endif
+
 #endif // FMPITREEBUILDER_H
diff --git a/Src/GroupTree/Core/FGroupOfParticles.hpp b/Src/GroupTree/Core/FGroupOfParticles.hpp
index 93478fc5ab679e466e6edfd4e8b8911a21631709..4225095238331ac3e9266d3aea9632163c061ff8 100644
--- a/Src/GroupTree/Core/FGroupOfParticles.hpp
+++ b/Src/GroupTree/Core/FGroupOfParticles.hpp
@@ -68,8 +68,6 @@ protected:
     BlockHeader*    blockHeader;
     //< Pointer to leaves information
     LeafHeader*     leafHeader;
-    //< The total number of particles in the group
-    const FSize nbParticlesInGroup;
 
     //< Pointers to particle position x, y, z
     FReal* particlePosition[3];
@@ -92,7 +90,7 @@ public:
     FGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
                       unsigned char* inAttributes)
         : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
-          blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
+          blockHeader(nullptr), leafHeader(nullptr),
           attributesBuffer(nullptr), deleteBuffer(false){
         // Move the pointers to the correct position
         blockHeader         = reinterpret_cast<BlockHeader*>(inBuffer);
@@ -127,12 +125,12 @@ public:
  * @param inNumberOfLeaves total number of leaves in the interval (should be <= inEndingIndex-inEndingIndex)
  */
     FGroupOfParticles(const MortonIndex inStartingIndex, const MortonIndex inEndingIndex, const int inNumberOfLeaves, const FSize inNbParticles)
-        : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(inNbParticles),
+        : allocatedMemoryInByte(0), memoryBuffer(nullptr), blockHeader(nullptr), leafHeader(nullptr),
           deleteBuffer(true){
         memset(particlePosition, 0, sizeof(particlePosition));
         memset(particleAttributes, 0, sizeof(particleAttributes));
 
-        const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(nbParticlesInGroup+(MemoryAlignementParticles-1)*inNumberOfLeaves);
+        const FSize nbParticlesAllocatedInGroup = RoundToUpperParticles(inNbParticles+(MemoryAlignementParticles-1)*inNumberOfLeaves);
 
         // Find the number of leaf to allocate in the blocks
         FAssertLF((inEndingIndex-inStartingIndex) >= MortonIndex(inNumberOfLeaves));
@@ -161,6 +159,7 @@ public:
         blockHeader->endingIndex   = inEndingIndex;
         blockHeader->numberOfLeavesInBlock  = inNumberOfLeaves;
         blockHeader->nbParticlesAllocatedInGroup = nbParticlesAllocatedInGroup;
+        blockHeader->nbParticlesInGroup = inNbParticles;
 
         // Init particle pointers
         blockHeader->positionsLeadingDim = (sizeof(FReal) * nbParticlesAllocatedInGroup);
@@ -247,7 +246,7 @@ public:
 
     /** Get the total number of particles in the group */
     FSize getNbParticlesInGroup() const {
-        return nbParticlesInGroup;
+        return blockHeader->nbParticlesInGroup;
     }
 
     /** The size of the interval endingIndex-startingIndex (set from the constructor) */
diff --git a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp
index 04b6433c9892c29e06e6dfaa71b7842926db7a36..163585bdd98f48ac0431cb2d342d45810a81f108 100644
--- a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp
@@ -15,7 +15,6 @@
 
 #include "FOutOfBlockInteraction.hpp"
 
-#include <vector>
 #include <vector>
 
 #include <omp.h>
@@ -37,6 +36,12 @@
 #define priority_if_supported(x)
 #endif
 
+#undef taskname_if_supported
+#ifdef OPENMP_SUPPORT_TASK_NAME
+#define taskname_if_supported(n) taskname(n)
+#else
+#define taskname_if_supported(n)
+#endif
 
 
 template <class OctreeClass, class CellContainerClass, class CellClass,
@@ -372,7 +377,7 @@ protected:
 
             ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);
 
-            #pragma omp task default(shared) firstprivate(leafCells, cellPoles, containers) depend(inout: cellPoles[0]) priority_if_supported(priorities.getInsertionPosP2M())
+            #pragma omp task default(shared) firstprivate(leafCells, cellPoles, containers) depend(inout: cellPoles[0]) priority_if_supported(priorities.getInsertionPosP2M()) taskname_if_supported("P2M")
             {
                 FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, leafCells->getStartingIndex() * 20 * 8, "P2M"));
                 KernelClass*const kernel = kernels[omp_get_thread_num()];
@@ -417,7 +422,7 @@ protected:
                     subCellGroup = (*iterChildCells);
                     subCellGroupPoles = (*iterChildCells)->getRawMultipoleBuffer();
 
-                    #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellPoles, subCellGroup, subCellGroupPoles) depend(commute_if_supported: cellPoles[0]) depend(in: subCellGroupPoles[0])  priority_if_supported(priorities.getInsertionPosM2M(idxLevel))
+                    #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellPoles, subCellGroup, subCellGroupPoles) depend(commute_if_supported: cellPoles[0]) depend(in: subCellGroupPoles[0]) priority_if_supported(priorities.getInsertionPosM2M(idxLevel)) taskname_if_supported("M2M")
                     {
                         KernelClass*const kernel = kernels[omp_get_thread_num()];
                         const MortonIndex firstParent = FMath::Max(currentCells->getStartingIndex(), subCellGroup->getStartingIndex()>>3);
@@ -492,7 +497,7 @@ protected:
                     PoleCellClass* cellPoles = currentCells->getRawMultipoleBuffer();
                     LocalCellClass* cellLocals = currentCells->getRawLocalBuffer();
 
-#pragma omp task default(none) firstprivate(currentCells, cellPoles, cellLocals, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellPoles[0])  priority_if_supported(priorities.getInsertionPosM2L(idxLevel))
+#pragma omp task default(none) firstprivate(currentCells, cellPoles, cellLocals, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellPoles[0])  priority_if_supported(priorities.getInsertionPosM2L(idxLevel)) taskname_if_supported("M2L")
                     {
                         FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((currentCells->getStartingIndex() *20) + idxLevel ) * 8 + 2, "M2L"));
                         const MortonIndex blockStartIdx = currentCells->getStartingIndex();
@@ -555,7 +560,7 @@ protected:
                         LocalCellClass* cellOtherLocals = cellsOther->getRawLocalBuffer();
                         const std::vector<OutOfBlockInteraction>* outsideInteractions = &(*currentInteractions).interactions;
 
-                        #pragma omp task default(none) firstprivate(currentCells, cellLocals, outsideInteractions, cellsOther, cellOtherPoles, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellOtherPoles[0])  priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel))
+                        #pragma omp task default(none) firstprivate(currentCells, cellLocals, outsideInteractions, cellsOther, cellOtherPoles, idxLevel) depend(commute_if_supported: cellLocals[0]) depend(in: cellOtherPoles[0])  priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) taskname_if_supported("M2L-out")
                         {
                             FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (((currentCells->getStartingIndex()+1) * (cellsOther->getStartingIndex()+2)) * 20 + idxLevel) * 8 + 3, "M2L-ext"));
                             KernelClass*const kernel = kernels[omp_get_thread_num()];
@@ -571,7 +576,7 @@ protected:
                             }
                         }
 
-                        #pragma omp task default(none) firstprivate(currentCells, cellPoles, outsideInteractions, cellsOther, cellOtherLocals, idxLevel) depend(commute_if_supported: cellOtherLocals[0]) depend(in: cellPoles[0])  priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel))
+                        #pragma omp task default(none) firstprivate(currentCells, cellPoles, outsideInteractions, cellsOther, cellOtherLocals, idxLevel) depend(commute_if_supported: cellOtherLocals[0]) depend(in: cellPoles[0])  priority_if_supported(priorities.getInsertionPosM2LExtern(idxLevel)) taskname_if_supported("M2L-out")
                         {
                             FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (((currentCells->getStartingIndex()+1) * (cellsOther->getStartingIndex()+1)) * 20 + idxLevel) * 8 + 3, "M2L-ext"));
                             KernelClass*const kernel = kernels[omp_get_thread_num()];
@@ -631,7 +636,7 @@ protected:
                     subCellLocalGroupsLocal = (*iterChildCells)->getRawLocalBuffer();
 
                     if(noCommuteAtLastLevel == false || idxLevel != FAbstractAlgorithm::lowerWorkingLevel - 2){
-                        #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2L(idxLevel))
+                        #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) taskname_if_supported("L2L")
                         {
                             KernelClass*const kernel = kernels[omp_get_thread_num()];
 
@@ -674,7 +679,7 @@ protected:
                         }
                     }
                     else{
-                        #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(inout: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2L(idxLevel))
+                        #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(inout: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2L(idxLevel)) taskname_if_supported("L2L")
                         {
                             KernelClass*const kernel = kernels[omp_get_thread_num()];
 
@@ -760,7 +765,7 @@ protected:
                     unsigned char* containersOtherDown = containersOther->getRawAttributesBuffer();
                     const std::vector<OutOfBlockInteraction>* outsideInteractions = &(*currentInteractions).interactions;
 
-#pragma omp task default(none) firstprivate(containers, containersDown, containersOther, containersOtherDown, outsideInteractions) depend(commute_if_supported: containersOtherDown[0], containersDown[0])  priority_if_supported(priorities.getInsertionPosP2PExtern())
+#pragma omp task default(none) firstprivate(containers, containersDown, containersOther, containersOtherDown, outsideInteractions) depend(commute_if_supported: containersOtherDown[0], containersDown[0])  priority_if_supported(priorities.getInsertionPosP2PExtern()) taskname_if_supported("P2P-out")
                     {
                         FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((containersOther->getStartingIndex()+1) * (containers->getStartingIndex()+1))*20*8 + 6, "P2P-ext"));
                         KernelClass*const kernel = kernels[omp_get_thread_num()];
@@ -798,7 +803,7 @@ protected:
                 ParticleGroupClass* containers = (*iterParticles);
                 unsigned char* containersDown = containers->getRawAttributesBuffer();
 
-                #pragma omp task default(none) firstprivate(containers, containersDown) depend(commute_if_supported: containersDown[0])  priority_if_supported(priorities.getInsertionPosP2P())
+                #pragma omp task default(none) firstprivate(containers, containersDown) depend(commute_if_supported: containersDown[0])  priority_if_supported(priorities.getInsertionPosP2P()) taskname_if_supported("P2P")
                 {
                     FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, containers->getStartingIndex()*20*8 + 5, "P2P"));
                     const MortonIndex blockStartIdx = containers->getStartingIndex();
@@ -853,7 +858,7 @@ protected:
             ParticleGroupClass* containers = tree->getParticleGroup(idxGroup);
             unsigned char* containersDown = containers->getRawAttributesBuffer();
 
-            #pragma omp task default(shared) firstprivate(leafCells, cellLocals, containers, containersDown) depend(commute_if_supported: containersDown[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2P())
+            #pragma omp task default(shared) firstprivate(leafCells, cellLocals, containers, containersDown) depend(commute_if_supported: containersDown[0]) depend(in: cellLocals[0])  priority_if_supported(priorities.getInsertionPosL2P()) taskname_if_supported("L2P")
             {
                 FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, (leafCells->getStartingIndex()*20*8) + 7, "L2P"));
                 KernelClass*const kernel = kernels[omp_get_thread_num()];
diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
index eba5a5cd192f366a62416b36e8132d0252d433fc..562e3789b23b31ec0cca5ed1df13a9e21d380293 100644
--- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp
@@ -250,6 +250,27 @@ public:
 #endif
     }
 
+    /** Acquires then releases, in STARPU_R mode, every cell and particle handle. */
+    void syncData(){
+        for(int level = 0 ; level < tree->getHeight() ; ++level){
+            for(int idxCell = 0 ; idxCell < static_cast<int>(cellHandles[level].size()) ; ++idxCell){
+                starpu_data_acquire(cellHandles[level][idxCell].symb, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].symb);
+                starpu_data_acquire(cellHandles[level][idxCell].up, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].up);
+                starpu_data_acquire(cellHandles[level][idxCell].down, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].down);
+            }
+        }
+        // For the particle groups only the symb and down handles are touched.
+        for(int idxPart = 0 ; idxPart < static_cast<int>(particleHandles.size()) ; ++idxPart){
+            starpu_data_acquire(particleHandles[idxPart].symb, STARPU_R);
+            starpu_data_release(particleHandles[idxPart].symb);
+            starpu_data_acquire(particleHandles[idxPart].down, STARPU_R);
+            starpu_data_release(particleHandles[idxPart].down);
+        }
+    }
+
     ~FGroupTaskStarPUAlgorithm(){
         starpu_resume();
 
@@ -338,6 +359,11 @@ protected:
 
         FLOG( FLog::Controller << "\t\t Submitting the tasks took " << timerSoumission.tacAndElapsed() << "s\n" );
         starpu_task_wait_for_all();
+
+        FLOG( FTic timerSync; );
+        syncData();
+        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
+
         starpu_pause();
 
 #ifdef STARPU_USE_CPU
diff --git a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
index fb44ebdc6d04a9c15e2da2b631fdc8563613803d..2dc3e070d1c71216829fbb57a04a5887e4762eec 100644
--- a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
+++ b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp
@@ -265,6 +265,27 @@ public:
 #endif
     }
 
+    /** Read-acquires and immediately releases each cell and particle handle. */
+    void syncData(){
+        for(int level = 0 ; level < tree->getHeight() ; ++level){
+            for(int idxCell = 0 ; idxCell < static_cast<int>(cellHandles[level].size()) ; ++idxCell){
+                starpu_data_acquire(cellHandles[level][idxCell].symb, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].symb);
+                starpu_data_acquire(cellHandles[level][idxCell].up, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].up);
+                starpu_data_acquire(cellHandles[level][idxCell].down, STARPU_R);
+                starpu_data_release(cellHandles[level][idxCell].down);
+            }
+        }
+        // Particle handles: only their symb and down members are visited.
+        for(int idxPart = 0 ; idxPart < static_cast<int>(particleHandles.size()) ; ++idxPart){
+            starpu_data_acquire(particleHandles[idxPart].symb, STARPU_R);
+            starpu_data_release(particleHandles[idxPart].symb);
+            starpu_data_acquire(particleHandles[idxPart].down, STARPU_R);
+            starpu_data_release(particleHandles[idxPart].down);
+        }
+    }
+
     ~FGroupTaskStarPUMpiAlgorithm(){
         starpu_resume();
 
@@ -324,6 +345,7 @@ public:
     }
 
 protected:
+
     /**
       * Runs the complete algorithm.
       */
@@ -362,6 +384,11 @@ protected:
 #endif
 
         starpu_task_wait_for_all();
+
+        FLOG( FTic timerSync; );
+        syncData();
+        FLOG( FLog::Controller << "\t\t Moving data to the host took " << timerSync.tacAndElapsed() << "s\n" );
+
         starpu_pause();
 
 #ifdef STARPU_USE_CPU
diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
index cd1980ca2599ea422155b335674bc33e093c652b..9526174d8dada5fe40a04af7960654c15978908e 100644
--- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
+++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.cu
@@ -65,70 +65,48 @@ __host__ void FCuda__bottomPassCallback(unsigned char* leafCellsPtr, std::size_t
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__upwardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-                                         FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-                                         FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-                                         int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){
+                                         unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr, // a single child block (raw buffer, size, up buffer)
+                                         int idxLevel, CudaKernelClass* kernel){ // calls kernel->M2M once per parent cell from firstParent to lastParent
     CellContainerClass currentCells(currentCellsPtr, currentCellsSize,currentCellsUpPtr,nullptr);
-    CellContainerClass subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], subCellGroupsUpPtr.values[idx], nullptr);
-    }
-
-    const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
-    const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
+    CellContainerClass subCellGroup(childCellsPtr, childCellsSize,childCellsUpPtr,nullptr); // child block view, built the same way as currentCells
 
-    if(firstCell == currentCells.getNumberOfCellsInBlock()){
-        return ;
-    }
+    const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); // a child's parent index is its Morton index >> 3
+    const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); // upper bound of the parent range shared by both blocks
 
-    FCudaAssertLF(nbSubCellGroups != 0);
-    int idxSubCellGroup = 0;
-    int idxChildCell = 0;
-    {// Find first child
-        const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell);
-        while(idxSubCellGroup != nbSubCellGroups
-              && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){
-            idxSubCellGroup += 1;
-        }
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
-        idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0));
-    }
-    FCudaAssertLF(idxChildCell != -1);
+    int idxParentCell = currentCells.getCellIndex(firstParent); // NOTE(review): result is not checked against -1 - confirm firstParent is always present
+    int idxChildCell = subCellGroup.getFistChildIdx(firstParent); // position of the first child of firstParent inside the child block
 
-    for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){
-        typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(cellIdx);
-        FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx));
+    while(true){ // NOTE(review): no blockIdx/threadIdx split here, every launched thread walks the whole range - confirm intended
+        typename CellContainerClass::CompleteCellClass cell = currentCells.getUpCell(idxParentCell); // parent cell handled by this iteration
         typename CellContainerClass::CompleteCellClass child[8];
 
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
 
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = nullptr;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7);
-            FCudaAssertLF(child[idxChild].symb == nullptr);
-            child[idxChild] = subCellGroups[idxSubCellGroup].getUpCell(idxChildCell);
+        do{ // collect the children of this parent that are stored in subCellGroup
+            const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); // low 3 bits of the Morton index give the child slot
+            child[idxChild] = subCellGroup.getUpCell(idxChildCell);
 
             idxChildCell += 1;
-            if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
-        }
+        }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); // stop at the end of the block or at the first child of another parent
 
         kernel->M2M(cell, child, idxLevel);
+
+        if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ // the last shared parent has been handled
+            break;
+        }
+
+        idxParentCell += 1;
     }
 }
 
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-                                        FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-                                        FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-                                        int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
+                                        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+                                        int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize){
 
     FCuda__upwardPassPerform
@@ -136,8 +114,8 @@ __host__ void FCuda__upwardPassCallback(unsigned char* currentCellsPtr, std::siz
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>
                             (currentCellsPtr, currentCellsSize,currentCellsUpPtr,
-                             subCellGroupsPtr, subCellGroupsSize,subCellGroupsUpPtr,
-                             nbSubCellGroups, idxLevel, kernel);
+                             childCellsPtr, childCellsSize,childCellsUpPtr,
+                             idxLevel, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
 }
@@ -169,11 +147,7 @@ __global__  void FCuda__transferInoutPassPerformMpi(unsigned char* currentCellsP
             typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
             FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
 
-            typename CellContainerClass::CompleteCellClass interactions[343];
-            memset(interactions, 0, 343*sizeof(interactions[0]));
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell;
-            const int counter = 1;
-            kernel->M2L( cell , interactions, counter, idxLevel);
+            kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel);
         }
     }
 }
@@ -230,8 +204,7 @@ __global__  void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, st
         const int3 coord = (FCudaTreeCoordinate::ConvertCoordinate(cell.symb->coordinates));
         int counter = FCudaTreeCoordinate::GetInteractionNeighbors(coord, idxLevel,interactionsIndexes,interactionsPosition);
 
-        typename CellContainerClass::CompleteCellClass interactions[343];
-        memset(interactions, 0, 343*sizeof(interactions[0]));
+        typename CellContainerClass::CompleteCellClass interactions[189];
         int counterExistingCell = 0;
 
         for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
@@ -239,15 +212,14 @@ __global__  void FCuda__transferInPassPerform(unsigned char* currentCellsPtr, st
                 const int cellPos = currentCells.getCellIndex(interactionsIndexes[idxInter]);
                 if(cellPos != -1){
                     typename CellContainerClass::CompleteCellClass interCell = currentCells.getUpCell(cellPos);
-                    FCudaAssertLF(interCell.symb->mortonIndex == interactionsIndexes[idxInter]);
-                    FCudaAssertLF(interactions[interactionsPosition[idxInter]].symb == nullptr);
-                    interactions[interactionsPosition[idxInter]] = interCell;
+                    interactions[counterExistingCell] = interCell;
+                    interactionsPosition[counterExistingCell] = interactionsPosition[idxInter];
                     counterExistingCell += 1;
                 }
             }
         }
 
-        kernel->M2L( cell , interactions, counterExistingCell, idxLevel);
+        kernel->M2L( cell , interactions, interactionsPosition, counterExistingCell, idxLevel);
     }
 }
 
@@ -272,36 +244,37 @@ __host__ void FCuda__transferInPassCallback(unsigned char* currentCellsPtr, std:
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-                                                unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+                                                unsigned char* currentCellsDownPtr,
                                                 unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-                                                unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-                                                int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+                                                unsigned char* externalCellsUpPtr,
+                                                int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions, // mode == 1 selects the first loop below, any other value the second
                                                 int nbOutsideInteractions, CudaKernelClass* kernel){
     if(blockIdx.x != 0){
         return;
     }
 
-    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
-    CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr);
+    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr); // only the down buffer of the current block is provided
+    CellContainerClass cellsOther(externalCellsPtr, externalCellsSize, externalCellsUpPtr, nullptr); // only the up buffer of the external block is provided
 
-    for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
-        const int cellPos = cellsOther.getCellIndex(outsideInteractions[outInterIdx].outIndex);
-        if(cellPos != -1){
-            typename CellContainerClass::CompleteCellClass interCell = cellsOther.getCompleteCell(outsideInteractions[outInterIdx].outIndex);
+    if(mode == 1){ // current block is indexed with insideIdxInBlock, external block with outsideIdxInBlock
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            typename CellContainerClass::CompleteCellClass interCell = cellsOther.getUpCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
             FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
-            typename CellContainerClass::CompleteCellClass cell = currentCells.getCompleteCell(outsideInteractions[outInterIdx].insideIdxInBlock);
-            FCudaAssertLF(cell.symb);
+            typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(outsideInteractions[outInterIdx].insideIdxInBlock);
             FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
 
-            typename CellContainerClass::CompleteCellClass interactions[343];
-            memset(interactions, 0, 343*sizeof(interactions[0]));
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell;
-            const int counter = 1;
-            kernel->M2L( cell , interactions, counter, idxLevel);
+            kernel->M2L( cell , &interCell, &outsideInteractions[outInterIdx].relativeOutPosition, 1, idxLevel); // one interacting cell per call, passed with its relative position
+        }
+    }
+    else{ // here the external block is indexed with insideIdxInBlock and the current block with outsideIdxInBlock
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            typename CellContainerClass::CompleteCellClass cell = cellsOther.getUpCell(outsideInteractions[outInterIdx].insideIdxInBlock);
+            FCudaAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
+            typename CellContainerClass::CompleteCellClass interCell = currentCells.getDownCell(outsideInteractions[outInterIdx].outsideIdxInBlock);
+            FCudaAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
 
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition].symb = nullptr;
-            interactions[FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = cell;
-            kernel->M2L( interCell , interactions, counter, idxLevel);
+            const int otherPosition = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition); // presumably the same interaction seen from the other cell - verify against FMGetOppositeInterIndex
+            kernel->M2L( interCell , &cell, &otherPosition, 1, idxLevel); // first M2L argument is again the cell taken from currentCells
         }
     }
 }
@@ -310,10 +283,10 @@ __global__ void FCuda__transferInoutPassPerform(unsigned char* currentCellsPtr,
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-                                               unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+                                               unsigned char* currentCellsDownPtr,
                                                unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-                                               unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-                                               int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+                                               unsigned char* externalCellsUpPtr,
+                                               int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
                                                int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize){
     OutOfBlockInteraction* cuOutsideInteractions;
@@ -326,10 +299,10 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
             <SymboleCellClass, PoleCellClass, LocalCellClass,
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>(currentCellsPtr, currentCellsSize,
-                                                                currentCellsUpPtr, currentCellsDownPtr,
+                                                                currentCellsDownPtr,
                                                                 externalCellsPtr, externalCellsSize,
-                                                                externalCellsUpPtr, externalCellsDownPtr,
-                                                                idxLevel, cuOutsideInteractions,
+                                                                externalCellsUpPtr,
+                                                                idxLevel, mode, cuOutsideInteractions,
                                                                 nbOutsideInteractions, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
@@ -345,77 +318,56 @@ __host__ void FCuda__transferInoutPassCallback(unsigned char* currentCellsPtr, s
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __global__ void FCuda__downardPassPerform(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-                                          FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-                                          FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-                                          int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel){
-    FCudaAssertLF(nbSubCellGroups != 0);
-    CellContainerClass currentCells(currentCellsPtr, currentCellsSize, nullptr, currentCellsDownPtr);
-    CellContainerClass subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx].reset(subCellGroupsPtr.values[idx], subCellGroupsSize.values[idx], nullptr, subCellGroupsDownPtr.values[idx]);
-    }
+                                          unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, // a single child block (raw buffer, size, down buffer)
+                                          int idxLevel, CudaKernelClass* kernel){ // calls kernel->L2L once per parent cell from firstParent to lastParent
+    CellContainerClass currentCells(currentCellsPtr, currentCellsSize,nullptr,currentCellsDownPtr); // parent block view, third argument left null
+    CellContainerClass subCellGroup(childCellsPtr, childCellsSize,nullptr,childCellsDownPtr); // child block view, built the same way
 
-    const int firstCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), blockIdx.x*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
-    const int lastCell = FCudaMin(currentCells.getNumberOfCellsInBlock(), (blockIdx.x+1)*((currentCells.getNumberOfCellsInBlock()+gridDim.x-1)/gridDim.x));
+    const MortonIndex firstParent = FCudaMax(currentCells.getStartingIndex(), subCellGroup.getStartingIndex()>>3); // a child's parent index is its Morton index >> 3
+    const MortonIndex lastParent = FCudaMin(currentCells.getEndingIndex()-1, (subCellGroup.getEndingIndex()-1)>>3); // upper bound of the parent range shared by both blocks
 
-    if(firstCell == currentCells.getNumberOfCellsInBlock()){
-        return ;
-    }
+    int idxParentCell = currentCells.getCellIndex(firstParent); // NOTE(review): result is not checked against -1 - confirm firstParent is always present
+    int idxChildCell = subCellGroup.getFistChildIdx(firstParent); // position of the first child of firstParent inside the child block
 
-    FCudaAssertLF(nbSubCellGroups != 0);
-    int idxSubCellGroup = 0;
-    int idxChildCell = 0;
-    {// Find first child
-        const MortonIndex mindex = currentCells.getCellMortonIndex(firstCell);
-        while(idxSubCellGroup != nbSubCellGroups
-              && (mindex < (subCellGroups[idxSubCellGroup].getStartingIndex()>>3))){
-            idxSubCellGroup += 1;
-        }
-        FCudaAssertLF(idxSubCellGroup != nbSubCellGroups);
-        idxChildCell = subCellGroups[idxSubCellGroup].getFistChildIdx(currentCells.getCellMortonIndex(0));
-    }
-    FCudaAssertLF(idxChildCell != -1);
-
-    for(int cellIdx = firstCell ; cellIdx < lastCell ; ++cellIdx){
-        typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(cellIdx);
-        FCudaAssertLF(cell.symb->mortonIndex == currentCells.getCellMortonIndex(cellIdx));
+    while(true){ // NOTE(review): no blockIdx/threadIdx split here, every launched thread walks the whole range - confirm intended
+        typename CellContainerClass::CompleteCellClass cell = currentCells.getDownCell(idxParentCell); // parent cell handled by this iteration
         typename CellContainerClass::CompleteCellClass child[8];
 
+
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = nullptr;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((subCellGroups[idxSubCellGroup].getCellMortonIndex(idxChildCell)) & 7);
-            FCudaAssertLF(child[idxChild].symb == nullptr);
-            child[idxChild] = subCellGroups[idxSubCellGroup].getDownCell(idxChildCell);
+        do{ // collect the children of this parent that are stored in subCellGroup
+            const int idxChild = ((subCellGroup.getCellMortonIndex(idxChildCell)) & 7); // low 3 bits of the Morton index give the child slot
+            child[idxChild] = subCellGroup.getDownCell(idxChildCell);
 
             idxChildCell += 1;
-            if(idxChildCell == subCellGroups[idxSubCellGroup].getNumberOfCellsInBlock()){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
-        }
+        }while(idxChildCell != subCellGroup.getNumberOfCellsInBlock() && cell.symb->mortonIndex == (subCellGroup.getCellMortonIndex(idxChildCell)>>3)); // stop at the end of the block or at the first child of another parent
 
         kernel->L2L(cell, child, idxLevel);
+
+        if(currentCells.getCellMortonIndex(idxParentCell) == lastParent){ // the last shared parent has been handled
+            break;
+        }
+
+        idxParentCell += 1;
     }
 }
 
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 __host__ void FCuda__downardPassCallback(unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-                                         FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-                                         FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-                                         int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream,
+                                        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr, // a single child block (raw buffer, size, down buffer)
+                                         int idxLevel, CudaKernelClass* kernel, cudaStream_t currentStream, // host wrapper: launches FCuda__downardPassPerform on currentStream, then synchronizes that stream
                                         const dim3 inGridSize, const dim3 inBlocksSize){
 
     FCuda__downardPassPerform
             <SymboleCellClass, PoleCellClass, LocalCellClass,
             CellContainerClass, ParticleContainerGroupClass, ParticleGroupClass, CudaKernelClass>
             <<<inGridSize, inBlocksSize, 0, currentStream>>>
-            (currentCellsPtr, currentCellsSize, currentCellsDownPtr, subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr,
-             nbSubCellGroups, idxLevel, kernel);
+            (currentCellsPtr, currentCellsSize, currentCellsDownPtr, childCellsPtr, childCellsSize, childCellsDownPtr, // arguments forwarded unchanged to the kernel
+             idxLevel, kernel);
     FCudaCheckAfterCall();
     FCudaCheck(cudaStreamSynchronize(currentStream));
 }
@@ -442,11 +394,9 @@ __global__ void FCuda__directInoutPassPerformMpi(unsigned char* containersPtr, s
         if(leafPos != -1){
             ParticleGroupClass interParticles = containersOther.template getLeaf<ParticleGroupClass>(leafPos);
             ParticleGroupClass particles = containers.template getLeaf<ParticleGroupClass>(outsideInteractions[outInterIdx].insideIdxInBlock);
-            ParticleGroupClass* interactions[27];
-            memset(interactions, 0, 27*sizeof(ParticleGroupClass*));
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = &interParticles;
-            const int counter = 1;
-            kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), &particles, &particles , interactions, counter);
+
+            kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1),
+                               &particles, &particles , &interParticles, &outsideInteractions[outInterIdx].relativeOutPosition, 1);
         }
     }
 }
@@ -502,9 +452,7 @@ __global__ void FCuda__directInPassPerform(unsigned char* containersPtr, std::si
         const int3 coord = FCudaTreeCoordinate::GetPositionFromMorton(mindex, treeHeight-1);
         int counter = FCudaTreeCoordinate::GetNeighborsIndexes(coord, treeHeight,interactionsIndexes,interactionsPosition);
 
-        ParticleGroupClass interactionsObjects[27];
-        ParticleGroupClass* interactions[27];
-        memset(interactions, 0, 27*sizeof(ParticleGroupClass*));
+        ParticleGroupClass interactionsObjects[26];
         int counterExistingCell = 0;
 
         for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
@@ -512,14 +460,13 @@ __global__ void FCuda__directInPassPerform(unsigned char* containersPtr, std::si
                 const int leafPos = containers.getLeafIndex(interactionsIndexes[idxInter]);
                 if(leafPos != -1){
                     interactionsObjects[counterExistingCell] = containers.template getLeaf<ParticleGroupClass>(leafPos);
-                    FCudaAssertLF(interactions[interactionsPosition[idxInter]] == nullptr);
-                    interactions[interactionsPosition[idxInter]] = &interactionsObjects[counterExistingCell];
+                    interactionsPosition[counterExistingCell] = interactionsPosition[idxInter];
                     counterExistingCell += 1;
                 }
             }
         }
 
-        kernel->P2P( coord, &particles, &particles , interactions, counterExistingCell);
+        kernel->P2P( coord, &particles, &particles , interactionsObjects, interactionsPosition, counterExistingCell);
     }
 }
 
@@ -548,7 +495,7 @@ __global__ void FCuda__directInoutPassPerform(unsigned char* containersPtr, std:
     }
 
     ParticleContainerGroupClass containers(containersPtr, containersSize, containersDownPtr);
-    ParticleContainerGroupClass containersOther(externalContainersPtr, externalContainersSize, externalContainersPtr);
+    ParticleContainerGroupClass containersOther(externalContainersPtr, externalContainersSize, externalContainersDownPtr);
 
     for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
         const int leafPos = containersOther.getLeafIndex(outsideInteractions[outInterIdx].outIndex);
@@ -559,15 +506,13 @@ __global__ void FCuda__directInoutPassPerform(unsigned char* containersPtr, std:
             FCudaAssertLF(containersOther.getLeafMortonIndex(leafPos) == outsideInteractions[outInterIdx].outIndex);
             FCudaAssertLF(containers.getLeafMortonIndex(outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex);
 
-            ParticleGroupClass* interactions[27];
-            memset(interactions, 0, 27*sizeof(ParticleGroupClass*));
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = &interParticles;
-            const int counter = 1;
-            kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), &particles, &particles , interactions, counter);
 
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = nullptr;
-            interactions[FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = &particles;
-            kernel->P2PRemote( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), &interParticles, &interParticles , interactions, counter);
+            kernel->P2POuter( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1),
+                               &particles , &interParticles, &outsideInteractions[outInterIdx].relativeOutPosition, 1);
+
+            const int otherPosition = FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition);
+            kernel->P2POuter( FCudaTreeCoordinate::GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1),
+                               &interParticles , &particles, &otherPosition, 1);
         }
     }
 }
@@ -682,9 +627,8 @@ template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroup
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -705,19 +649,18 @@ template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaG
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
                                         FCudaGroupOfParticles<int,0,0,int>, FCudaGroupAttachedLeaf<int,0,0,int>, FCudaEmptyKernel<int> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
+unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaEmptyKernel<int>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
@@ -770,9 +713,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -793,19 +735,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData,
 template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<float,0, 1, long long int>, FCudaGroupAttachedLeaf<float,0, 1, long long int>, FTestCudaKernels<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FTestCudaKernels<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -856,9 +797,8 @@ unsigned char* containersPtr, std::size_t containersSize,
 template void FCuda__upwardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -879,19 +819,18 @@ template void FCuda__transferInPassCallback<FTestCellPODCore, FTestCellPODData,
 template void FCuda__transferInoutPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
                                         FCudaGroupOfParticles<double,0, 1, long long int>, FCudaGroupAttachedLeaf<double,0, 1, long long int>, FTestCudaKernels<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+    int idxLevel, FTestCudaKernels<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FTestCellPODCore, FTestCellPODData, FTestCellPODData, FCudaGroupOfCells<FTestCellPODCore, FTestCellPODData, FTestCellPODData>,
@@ -936,22 +875,21 @@ template dim3 FCuda__GetBlockSize< FTestCudaKernels<double> >(FTestCudaKernels<d
 #include "../P2P/FCudaP2P.hpp"
 
 template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
 unsigned char* containersPtr, std::size_t containersSize,
     FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+    int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
     unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
     int idxLevel, const OutOfBlockInteraction* outsideInteractions,
@@ -959,32 +897,31 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #endif
 template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
     unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
     int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     unsigned char* externalContainersPtr, std::size_t externalContainersSize,
     const OutOfBlockInteraction* outsideInteractions,
@@ -992,13 +929,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #endif
 template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     const int treeHeight, FCudaP2P<float>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
     const OutOfBlockInteraction* outsideInteractions,
@@ -1006,7 +943,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<float,4, 4, float>, FCudaGroupAttachedLeaf<float,4, 4, float>, FCudaP2P<float> >
+                                        FCudaGroupOfParticles<float,1, 4, float>, FCudaGroupAttachedLeaf<float,1, 4, float>, FCudaP2P<float> >
     (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
     unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     FCudaP2P<float>* kernel, cudaStream_t currentStream,
@@ -1022,22 +959,21 @@ template dim3 FCuda__GetBlockSize< FCudaP2P<float> >(FCudaP2P<float>* cukernel);
 
 
 template void FCuda__bottomPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsUpPtr,
 unsigned char* containersPtr, std::size_t containersSize,
     FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__upwardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
     unsigned char* externalCellsPtr, std::size_t externalCellsSize, unsigned char* externalCellsUpPtr,
     int idxLevel, const OutOfBlockInteraction* outsideInteractions,
@@ -1045,32 +981,31 @@ template void FCuda__transferInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #endif
 template void FCuda__transferInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
     unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
     int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__transferInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
-    unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
-    int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
-                                        const dim3 inGridSize, const dim3 inBlocksSize);
+unsigned char* currentCellsDownPtr,
+unsigned char* externalCellsPtr, std::size_t externalCellsSize,
+unsigned char* externalCellsUpPtr,
+int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
+int nbOutsideInteractions, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+                                    const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__downardPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
+    unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+int idxLevel, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     unsigned char* externalContainersPtr, std::size_t externalContainersSize,
     const OutOfBlockInteraction* outsideInteractions,
@@ -1078,13 +1013,13 @@ template void FCuda__directInoutPassCallbackMpi<FCudaEmptyCellSymb, int, int, FC
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #endif
 template void FCuda__directInPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     const int treeHeight, FCudaP2P<double>* kernel, cudaStream_t currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     unsigned char* externalContainersPtr, std::size_t externalContainersSize, unsigned char* externalContainersDownPtr,
     const OutOfBlockInteraction* outsideInteractions,
@@ -1092,7 +1027,7 @@ template void FCuda__directInoutPassCallback<FCudaEmptyCellSymb, int, int, FCuda
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
 template void FCuda__mergePassCallback<FCudaEmptyCellSymb, int, int, FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
-                                        FCudaGroupOfParticles<double,4, 4, double>, FCudaGroupAttachedLeaf<double,4, 4, double>, FCudaP2P<double> >
+                                        FCudaGroupOfParticles<double,1, 4, double>, FCudaGroupAttachedLeaf<double,1, 4, double>, FCudaP2P<double> >
     (unsigned char* leafCellsPtr, std::size_t leafCellsSize, unsigned char* leafCellsDownPtr,
     unsigned char* containersPtr, std::size_t containersSize, unsigned char* containersDownPtr,
     FCudaP2P<double>* kernel, cudaStream_t currentStream,
diff --git a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
index 5559c908570cc75388ecfa1c2018ab4e98d53c59..4ba6615534e597cca892fbd02ce44a5c93a6e8cf 100644
--- a/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
+++ b/Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp
@@ -17,9 +17,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__upwardPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsUpPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t, 9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsUpPtr,
-    int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
+        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsUpPtr,
+     int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
@@ -43,10 +42,10 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__transferInoutPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize,
-    unsigned char* currentCellsUpPtr, unsigned char* currentCellsDownPtr,
+    unsigned char* currentCellsUpPtr,
     unsigned char* externalCellsPtr, std::size_t externalCellsSize,
-    unsigned char* externalCellsUpPtr, unsigned char* externalCellsDownPtr,
-    int idxLevel, const OutOfBlockInteraction* outsideInteractions,
+    unsigned char* externalCellsDownPtr,
+    int idxLevel, int mode, const OutOfBlockInteraction* outsideInteractions,
     int nbOutsideInteractions, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 
@@ -54,9 +53,8 @@ template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
           class CellContainerClass, class ParticleContainerGroupClass, class ParticleGroupClass, class CudaKernelClass>
 void FCuda__downardPassCallback(
     unsigned char* currentCellsPtr, std::size_t currentCellsSize, unsigned char* currentCellsDownPtr,
-    FCudaParams<unsigned char*,9> subCellGroupsPtr, FCudaParams<std::size_t,9> subCellGroupsSize,
-    FCudaParams<unsigned char*,9> subCellGroupsDownPtr,
-    int nbSubCellGroups, int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
+        unsigned char* childCellsPtr, std::size_t childCellsSize, unsigned char* childCellsDownPtr,
+        int idxLevel, CudaKernelClass* kernel, cudaStream_t 	currentStream,
                                         const dim3 inGridSize, const dim3 inBlocksSize);
 #ifdef SCALFMM_USE_MPI
 template <class SymboleCellClass, class PoleCellClass, class LocalCellClass,
diff --git a/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp b/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp
index 6eb975c226ccad4e0556554b33a9128a04a295cf..1cdecb9f2aeb6af06d35c96b3aa69044f78b2091 100644
--- a/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp
+++ b/Src/GroupTree/Cuda/FCudaEmptyKernel.hpp
@@ -22,7 +22,8 @@ public:
     __device__ void M2M(CellClass  /*pole*/, const CellClass  /*child*/[8], const int /*level*/) {
     }
 
-    __device__ void M2L(CellClass  /*pole*/, const CellClass /*distantNeighbors*/[343],
+    __device__ void M2L(CellClass  /*pole*/, const CellClass* /*distantNeighbors*/,
+                        const int* /*neighPositions*/,
         const int /*size*/, const int /*level*/) {
     }
 
@@ -34,12 +35,20 @@ public:
 
     __device__ void P2P(const int3& ,
                  ContainerClass* const  /*targets*/, const ContainerClass* const  /*sources*/,
-                 ContainerClass* const /*directNeighborsParticles*/[27], const int ){
+                 ContainerClass* const /*directNeighborsParticles*/,
+                        const int* /*neighborPositions*/, const int ){
+    }
+
+    __device__ void P2POuter(const int3& ,
+                 ContainerClass* const  /*targets*/,
+                 ContainerClass* const /*directNeighborsParticles*/,
+                              const int* /*neighborPositions*/,const int ){
     }
 
     __device__ void P2PRemote(const int3& ,
                  ContainerClass* const  /*targets*/, const ContainerClass* const  /*sources*/,
-                 ContainerClass* const /*directNeighborsParticles*/[27], const int ){
+                 ContainerClass* const /*directNeighborsParticles*/,
+                              const int* /*neighborPositions*/,const int ){
     }
 
     __host__ static FCudaEmptyKernel* InitKernelKernel(void*){
diff --git a/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp b/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp
index 275a8c1adfb2a5269da5ef74eb17b877728c505e..0ea8daecbb6c252ad7faa54837e5a20b114f9945 100644
--- a/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp
+++ b/Src/GroupTree/Cuda/FCudaGroupAttachedLeaf.hpp
@@ -37,7 +37,7 @@ public:
         positionsPointers[2] = reinterpret_cast<FReal*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*2);
 
         for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){
-            attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*(idxAttribute+3));
+            attributes[idxAttribute] = reinterpret_cast<AttributeClass*>(reinterpret_cast<unsigned char*>(inPositionBuffer) + inLeadingPosition*3 + inLeadingAttributes*idxAttribute);
         }
 
         // Redirect pointers to data
diff --git a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
index d7d8aff5cea8627d3ad487b3e1286fc1af7bed2b..a119cc22ed8b35c53fc0fc835a621fb1ce15e95f 100644
--- a/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
+++ b/Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp
@@ -59,8 +59,6 @@ protected:
     BlockHeader*    blockHeader;
     //< Pointer to leaves information
     LeafHeader*     leafHeader;
-    //< The total number of particles in the group
-    const FSize nbParticlesInGroup;
 
     //< Pointers to particle position x, y, z
     FReal* particlePosition[3];
@@ -78,11 +76,12 @@ public:
     __device__ FCudaGroupOfParticles(unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
                                      unsigned char* inAttributes)
         : allocatedMemoryInByte(inAllocatedMemoryInByte), memoryBuffer(inBuffer),
-          blockHeader(nullptr), leafHeader(nullptr), nbParticlesInGroup(0),
+          blockHeader(nullptr), leafHeader(nullptr),
           attributesBuffer(nullptr){
         // Move the pointers to the correct position
-        blockHeader         = reinterpret_cast<BlockHeader*>(memoryBuffer);
-        leafHeader          = reinterpret_cast<LeafHeader*>(memoryBuffer+sizeof(BlockHeader)+(blockHeader->numberOfLeavesInBlock*sizeof(int)));
+        blockHeader         = reinterpret_cast<BlockHeader*>(inBuffer);
+        inBuffer += sizeof(BlockHeader);
+        leafHeader          = reinterpret_cast<LeafHeader*>(inBuffer);
 
         // Init particle pointers
         // Assert blockHeader->positionsLeadingDim == (sizeof(FReal) * blockHeader->nbParticlesAllocatedInGroup);
@@ -122,7 +121,7 @@ public:
 
     /** Get the total number of particles in the group */
     __device__ FSize getNbParticlesInGroup() const {
-        return nbParticlesInGroup;
+        return blockHeader->nbParticlesInGroup;
     }
 
     /** The size of the interval endingIndex-startingIndex (set from the constructor) */
diff --git a/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp b/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp
index 4dc38f2e34d2bf71ac8f93bc9bffaa2c61012737..bf4648b2ed88db61d3e30ef262c338f147f435f4 100644
--- a/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp
+++ b/Src/GroupTree/OpenCl/FEmptyOpenCLCode.hpp
@@ -21,57 +21,52 @@ public:
                     MortonIndex insideIndex;\
                     int relativeOutPosition;\
                     int insideIdxInBlock;\
+                    int outsideIdxInBlock;\
                 } __attribute__ ((aligned (DefaultStructAlign)));\
-                struct Uptr9{\
-                    __global unsigned char* ptrs[9];\
-                } __attribute__ ((aligned (DefaultStructAlign)));\
-                struct size_t9{\
-                    size_t v[9];\
-                }__attribute__ ((aligned (DefaultStructAlign)));\
                 __kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize,__global unsigned char* leafCellsUpPtr,\
-                                                         __global unsigned char* containersPtr, size_t containersSize,\
-                                                         __global void* userkernel ){\
+                    __global unsigned char* containersPtr, size_t containersSize,\
+                    __global void* userkernel ){\
                 }\
                 __kernel void FOpenCL__upwardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr,\
-                                                  struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsUpPtr,\
-                                                  int nbSubCellGroups, int idxLevel, __global void* userkernel){\
+                    __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr,\
+                    int idxLevel, __global void* userkernel){\
                 }\
                 __kernel  void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,\
-                                                             __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr,\
-                                                             int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\
-                                                             size_t nbOutsideInteractions, __global void* userkernel){\
+                    __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr,\
+                    int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\
+                    size_t nbOutsideInteractions, __global void* userkernel){\
                 }\
                 __kernel  void FOpenCL__transferInPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,\
-                                                        __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,\
-                                                       int idxLevel, __global void* userkernel){\
+                    __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,\
+                    int idxLevel, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,\
-                                                         __global unsigned char*  currentCellsUpPtr, __global unsigned char*  currentCellsDownPtr,\
-                                                         __global unsigned char* externalCellsPtr, size_t externalCellsSize,\
-                                                         __global unsigned char* externalCellsUpPtr, __global unsigned char* externalCellsDownPtr,\
-                                                         int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,\
-                                                         size_t nbOutsideInteractions, __global void* userkernel){\
+                    __global unsigned char*  currentCellsUpPtr,\
+                    __global unsigned char* externalCellsPtr, size_t externalCellsSize,\
+                    __global unsigned char* externalCellsDownPtr,\
+                    int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions,\
+                    size_t nbOutsideInteractions, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,\
-                                                   struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsDownPtr,\
-                                                   int nbSubCellGroups, int idxLevel, __global void* userkernel){\
+                    __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr,\
+                    int idxLevel, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\
-                                                          __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl,\
-                                                          const __global struct OutOfBlockInteraction* outsideInteractions,\
-                                                          size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\
+                    __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl,\
+                    const __global struct OutOfBlockInteraction* outsideInteractions,\
+                    size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\
-                                                    const int treeHeight, __global void* userkernel){\
+                    const int treeHeight, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__directInoutPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\
-                                                       __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr,\
-                                                       const __global struct OutOfBlockInteraction* outsideInteractions,\
-                                                       size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\
+                    __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr,\
+                    const __global struct OutOfBlockInteraction* outsideInteractions,\
+                    size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){\
                 }\
                 __kernel void FOpenCL__mergePassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize, __global unsigned char* leafCellsDownPtr,\
-                                                 __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\
-                                                 __global void* userkernel){\
+                    __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,\
+                    __global void* userkernel){\
                 }";
         return kernelcode;
     }
diff --git a/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp b/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp
index 4e54284e4df2e12bcaf44d23ade01f4e751dd671..eaa6220a5d72f27210a6ea821fd53c2805f3cacf 100644
--- a/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp
+++ b/Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp
@@ -21,14 +21,6 @@
 template <class OriginalKernelClass, class KernelFilenameClass = FEmptyOpenCLCode>
 class FOpenCLDeviceWrapper {
 protected:
-    struct  alignas(FStarPUDefaultAlign::StructAlign)  Uptr9{
-        cl_mem ptrs[9];
-    };
-
-    struct alignas(FStarPUDefaultAlign::StructAlign)  size_t9{
-        size_t v[9];
-    };
-
     static void SetKernelArgs(cl_kernel& /*kernel*/, const int /*pos*/){
     }
     template <class ParamClass, class... Args>
@@ -168,17 +160,11 @@ public:
 
 
     void upwardPassPerform(cl_mem currentCellsPtr,  size_t currentCellsSize, cl_mem currentCellsUpPtr,
-                           cl_mem subCellGroupsPtr[9],  size_t subCellGroupsSize[9], cl_mem subCellGroupsUpPtr[9],
-                            int nbSubCellGroups, int idxLevel, const int intervalSize){
-        Uptr9 ptrs;
-        memcpy(ptrs.ptrs, subCellGroupsPtr, sizeof(cl_mem)*9);
-        size_t9 sizes;
-        memcpy(sizes.v, subCellGroupsSize, sizeof(size_t)*9);
-        Uptr9 ptrsUp;
-        memcpy(ptrsUp.ptrs, subCellGroupsUpPtr, sizeof(cl_mem)*9);
+                           cl_mem subCellGroupsPtr,  size_t subCellGroupsSize, cl_mem subCellGroupsUpPtr,
+                           int idxLevel, const int intervalSize){
 
         SetKernelArgs(kernel_upwardPassPerform, 0, &currentCellsPtr, &currentCellsSize, &currentCellsUpPtr,
-                      &ptrs,  &sizes, &ptrsUp, &nbSubCellGroups, &idxLevel, &user_data);
+                      &subCellGroupsPtr,  &subCellGroupsSize, &subCellGroupsUpPtr, &idxLevel, &user_data);
         const int err = clEnqueueNDRangeKernel(queue_upwardPassPerform, kernel_upwardPassPerform, kernelFilename.getNbDims(), NULL,
                                                kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
@@ -205,29 +191,22 @@ public:
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 
-    void transferInoutPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsUpPtr, cl_mem currentCellsDownPtr,
-                                  cl_mem externalCellsPtr, size_t externalCellsSize, cl_mem externalCellsUpPtr, cl_mem externalCellsDownPtr,
-                                  int idxLevel, cl_mem outsideInteractionsCl, size_t outsideInteractionsSize, const int intervalSize){
-        SetKernelArgs(kernel_transferInoutPassPerform, 0, &currentCellsPtr,&currentCellsSize, &currentCellsUpPtr, &currentCellsDownPtr,
-                      &externalCellsPtr, &externalCellsSize, &externalCellsUpPtr, &externalCellsDownPtr,
-                      &idxLevel, &outsideInteractionsCl,&outsideInteractionsSize, &user_data);
+    void transferInoutPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsUpPtr,
+                                  cl_mem externalCellsPtr, size_t externalCellsSize, cl_mem externalCellsDownPtr,
+                                  int idxLevel, const int mode, cl_mem outsideInteractionsCl, size_t outsideInteractionsSize, const int intervalSize){
+        SetKernelArgs(kernel_transferInoutPassPerform, 0, &currentCellsPtr,&currentCellsSize, &currentCellsUpPtr,
+                      &externalCellsPtr, &externalCellsSize, &externalCellsDownPtr,
+                      &idxLevel, &mode, &outsideInteractionsCl,&outsideInteractionsSize, &user_data);
         const int err = clEnqueueNDRangeKernel(queue_transferInoutPassPerform, kernel_transferInoutPassPerform, kernelFilename.getNbDims(), NULL,
                                                kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
     }
 
     void downardPassPerform(cl_mem currentCellsPtr, size_t currentCellsSize, cl_mem currentCellsDownPtr,
-                            cl_mem subCellGroupsPtr[9],  size_t subCellGroupsSize[9], cl_mem subCellGroupsDownPtr[9],
-                            int nbSubCellGroups, int idxLevel, const int intervalSize){
-        Uptr9 ptrs;
-        memcpy(ptrs.ptrs, subCellGroupsPtr, sizeof(cl_mem)*9);
-        size_t9 sizes;
-        memcpy(sizes.v, subCellGroupsSize, sizeof(size_t)*9);
-        Uptr9 ptrsDown;
-        memcpy(ptrsDown.ptrs, subCellGroupsDownPtr, sizeof(cl_mem)*9);
-
+                            cl_mem subCellGroupsPtr,  size_t subCellGroupsSize, cl_mem subCellGroupsDownPtr,
+                            int idxLevel, const int intervalSize){
         SetKernelArgs(kernel_downardPassPerform, 0, &currentCellsPtr, &currentCellsSize, &currentCellsDownPtr,
-                      &ptrs, &sizes, &ptrsDown, &nbSubCellGroups, &idxLevel, &user_data);
+                      &subCellGroupsPtr, &subCellGroupsSize, &subCellGroupsDownPtr, &idxLevel, &user_data);
         const int err = clEnqueueNDRangeKernel(queue_downardPassPerform, kernel_downardPassPerform, kernelFilename.getNbDims(), NULL,
                                                kernelFilename.getNbGroups(intervalSize), kernelFilename.getGroupSize(), 0, NULL, NULL);
         if (err != CL_SUCCESS) STARPU_OPENCL_REPORT_ERROR(err);
diff --git a/Src/GroupTree/P2P/FCudaP2P.hpp b/Src/GroupTree/P2P/FCudaP2P.hpp
index e9807bacfcd322a14d0d8f1311e946269b3e83f3..41f0d93b4b1cff4a33c49331bd49c34718601164 100644
--- a/Src/GroupTree/P2P/FCudaP2P.hpp
+++ b/Src/GroupTree/P2P/FCudaP2P.hpp
@@ -6,29 +6,6 @@
 #include "../Cuda/FCudaEmptyCellSymb.hpp"
 #include "../Cuda/FCudaCompositeCell.hpp"
 
-#define DirectMacro(targetX, targetY, targetZ, targetPhys, \
-    forceX, forceY, forceZ, potential,\
-    sourcesX, sourcesY, sourcesZ, sourcesPhys)\
-{\
-    FReal dx = sourcesX - targetX;\
-    FReal dy = sourcesY - targetY;\
-    FReal dz = sourcesZ - targetZ;\
-    \
-    FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);\
-    FReal inv_distance = sqrt(inv_square_distance);\
-    \
-    inv_square_distance *= inv_distance;\
-    inv_square_distance *= targetPhys * sourcesPhys;\
-    \
-    dx *= inv_square_distance;\
-    dy *= inv_square_distance;\
-    dz *= inv_square_distance;\
-    \
-    forceX += dx;\
-    forceY += dy;\
-    forceZ += dz;\
-    sourcesPhys += inv_distance * sourcesPhys;\
-    }
 
 #define Min(x,y) ((x)<(y)?(x):(y))
 #define Max(x,y) ((x)>(y)?(x):(y))
@@ -40,6 +17,30 @@ template <class FReal>
 class FCudaP2P {
 protected:
 public:
+
+    __device__ void DirectComputation(const FReal& targetX, const FReal& targetY, const FReal& targetZ,const  FReal& targetPhys,
+                           FReal& forceX, FReal& forceY,FReal&  forceZ, FReal& potential,
+                           const FReal& sourcesX, const FReal& sourcesY, const FReal& sourcesZ, const FReal& sourcesPhys) const {
+        FReal dx = sourcesX - targetX;
+        FReal dy = sourcesY - targetY;
+        FReal dz = sourcesZ - targetZ;
+
+        FReal inv_square_distance = FReal(1.0) / (dx*dx + dy*dy + dz*dz);
+        FReal inv_distance = sqrt(inv_square_distance);
+
+        inv_square_distance *= inv_distance;
+        inv_square_distance *= targetPhys * sourcesPhys;
+
+        dx *= inv_square_distance;
+        dy *= inv_square_distance;
+        dz *= inv_square_distance;
+
+        forceX += dx;
+        forceY += dy;
+        forceZ += dz;
+        potential += inv_distance * sourcesPhys;
+    }
+
     static double DSqrt(const double val){
         return sqrt(val);
     }
@@ -48,10 +49,10 @@ public:
         return sqrtf(val);
     }
 
-    typedef FCudaGroupAttachedLeaf<FReal,4,4,FReal> ContainerClass;
+    typedef FCudaGroupAttachedLeaf<FReal,1,4,FReal> ContainerClass;
     typedef FCudaCompositeCell<FCudaEmptyCellSymb,int,int> CellClass;
 
-    static const int SHARE_SIZE = 128;
+    static const int SHARE_SIZE = 1;//128;
 
     __device__ void P2M(CellClass /*pole*/, const ContainerClass* const /*particles*/) {
     }
@@ -59,8 +60,9 @@ public:
     __device__ void M2M(CellClass  /*pole*/, const CellClass  /*child*/[8], const int /*level*/) {
     }
 
-    __device__ void M2L(CellClass  /*pole*/, const CellClass /*distantNeighbors*/[343],
-    const int /*size*/, const int /*level*/) {
+    __device__ void M2L(CellClass  /*pole*/, const CellClass* /*distantNeighbors*/,
+                        const int* /*neighPositions*/,
+                        const int /*size*/, const int /*level*/) {
     }
 
     __device__ void L2L(const CellClass  /*local*/, CellClass  /*child*/[8], const int /*level*/) {
@@ -71,25 +73,22 @@ public:
 
     __device__ void P2P(const int3& pos,
                         ContainerClass* const  targets, const ContainerClass* const  sources,
-                        ContainerClass* const directNeighborsParticles[27], const int counter){
+                        ContainerClass* const directNeighborsParticles,
+                        const int* neighborPositions, const int counter){
         // Compute with other
-        P2PRemote(pos, targets, sources, directNeighborsParticles, counter);
+        P2PRemote(pos, targets, sources, directNeighborsParticles, neighborPositions, counter);
         // Compute inside
-        const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
 
-        for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){
-            const int idxPart = (idxLoop*blockDim.x+threadIdx.x);
+        for(int idxPart = threadIdx.x ; idxPart < targets->getNbParticles()+blockDim.x-1 ; idxPart += blockDim.x){
             const bool threadCompute = (idxPart < targets->getNbParticles());
 
             FReal targetX, targetY, targetZ, targetPhys;
             FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
 
-            if(threadCompute){
-                targetX = targets->getPositions()[0][idxPart];
-                targetY = targets->getPositions()[1][idxPart];
-                targetZ = targets->getPositions()[2][idxPart];
-                targetPhys = targets->getAttribute(0)[idxPart];
-            }
+            targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+            targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+            targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+            targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
 
             for(int idxCopy = 0 ; idxCopy < targets->getNbParticles() ; idxCopy += SHARE_SIZE){
                 __shared__ FReal sourcesX[SHARE_SIZE];
@@ -99,57 +98,61 @@ public:
 
                 const int nbCopies = Min(SHARE_SIZE, targets->getNbParticles()-idxCopy);
                 if(threadIdx.x < nbCopies){
-                    sourcesX[threadIdx.x] = targets->getPositions()[0][idxPart];
-                    sourcesY[threadIdx.x] = targets->getPositions()[1][idxPart];
-                    sourcesZ[threadIdx.x] = targets->getPositions()[2][idxPart];
-                    sourcesPhys[threadIdx.x] = targets->getAttribute(0)[idxPart];
+                    sourcesX[threadIdx.x] = targets->getPositions()[0][threadIdx.x+idxCopy];
+                    sourcesY[threadIdx.x] = targets->getPositions()[1][threadIdx.x+idxCopy];
+                    sourcesZ[threadIdx.x] = targets->getPositions()[2][threadIdx.x+idxCopy];
+                    sourcesPhys[threadIdx.x] = targets->getAttribute(0)[threadIdx.x+idxCopy];
                 }
 
                 __syncthreads();
 
                 if(threadCompute){
-                    const int leftCopies = Min(idxPart, nbCopies);
+                    int leftCopies = nbCopies;
+                    if(idxCopy <= idxPart && idxPart < idxCopy + nbCopies){
+                        leftCopies = idxPart - idxCopy;
+                    }
+
                     // Left Part
                     for(int otherIndex = 0; otherIndex < leftCopies - 3; otherIndex += 4) { // unrolling x4
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
                     }
 
-                    for(int otherIndex = (leftCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                    for(int otherIndex = (leftCopies/4) * 4; otherIndex < leftCopies; ++otherIndex) { // if nk%4 is not zero
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
                     }
                     // Right Part
                     for(int otherIndex = leftCopies+1; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
                     }
 
-                    for(int otherIndex = Max(leftCopies+1, (nbCopies/4) * 4); otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
-                        DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                    forceX, forceY, forceZ, potential,
-                                    sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                    for(int otherIndex = leftCopies+1 + ((nbCopies-(leftCopies+1))/4)*4 ; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
+                        DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                          forceX, forceY, forceZ, potential,
+                                          sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
                     }
                 }
 
@@ -157,86 +160,155 @@ public:
             }
 
             if( threadCompute ){
-                targets->getAttribute(1)[idxPart] += forceX;
-                targets->getAttribute(2)[idxPart] += forceY;
-                targets->getAttribute(3)[idxPart] += forceZ;
-                targets->getAttribute(4)[idxPart] += potential;
+                targets->getAttribute(1)[idxPart] += potential;
+                targets->getAttribute(2)[idxPart] += forceX;
+                targets->getAttribute(3)[idxPart] += forceY;
+                targets->getAttribute(4)[idxPart] += forceZ;
             }
 
+            __syncthreads();
         }
     }
 
     __device__ void P2PRemote(const int3& ,
                               ContainerClass* const  targets, const ContainerClass* const  /*sources*/,
-                              ContainerClass* const directNeighborsParticles[27], const int ){
-        for(int idxNeigh = 0 ; idxNeigh < 27 ; ++idxNeigh){
-            if(directNeighborsParticles[idxNeigh]){
-                const int nbLoops = (targets->getNbParticles()+blockDim.x-1)/blockDim.x;
-
-                for(int idxLoop = 0 ; idxLoop < nbLoops; ++idxLoop){
-                    const int idxPart = (idxLoop*blockDim.x+threadIdx.x);
-                    const bool threadCompute = (idxPart < targets->getNbParticles());
+                              ContainerClass* const directNeighborsParticles,
+                              const int* /*neighborsPositions*/, const int counter){
+        for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+
+            for(int idxPart = threadIdx.x ; (idxPart - int(threadIdx.x)) < targets->getNbParticles() ; idxPart += blockDim.x){ // uniform trip count: every thread runs ceil(nbParticles/blockDim.x) rounds, so all of them reach each __syncthreads()
+                const bool threadCompute = (idxPart < targets->getNbParticles());
+
+                FReal targetX, targetY, targetZ, targetPhys;
+                FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+
+                targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+                targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+                targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+                targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
+
+                for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
+                    __shared__ FReal sourcesX[SHARE_SIZE];
+                    __shared__ FReal sourcesY[SHARE_SIZE];
+                    __shared__ FReal sourcesZ[SHARE_SIZE];
+                    __shared__ FReal sourcesPhys[SHARE_SIZE];
+
+                    const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
+                    if(threadIdx.x < nbCopies){ // NOTE(review): each thread fills at most one slot, so the tile is complete only if blockDim.x >= SHARE_SIZE - confirm launch config
+                        sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
+                        sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
+                        sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy];
+                        sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy];
+                    }
 
-                    FReal targetX, targetY, targetZ, targetPhys;
-                    FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+                    __syncthreads();
 
                     if(threadCompute){
-                        targetX = targets->getPositions()[0][idxPart];
-                        targetY = targets->getPositions()[1][idxPart];
-                        targetZ = targets->getPositions()[2][idxPart];
-                        targetPhys = targets->getAttribute(0)[idxPart];
-                    }
-
-                    for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh]->getNbParticles() ; idxCopy += SHARE_SIZE){
-                        __shared__ FReal sourcesX[SHARE_SIZE];
-                        __shared__ FReal sourcesY[SHARE_SIZE];
-                        __shared__ FReal sourcesZ[SHARE_SIZE];
-                        __shared__ FReal sourcesPhys[SHARE_SIZE];
-
-                        const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh]->getNbParticles()-idxCopy);
-                        if(threadIdx.x < nbCopies){
-                            sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh]->getPositions()[0][idxPart];
-                            sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh]->getPositions()[1][idxPart];
-                            sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh]->getPositions()[2][idxPart];
-                            sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh]->getAttribute(0)[idxPart];
+                        for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
                         }
 
-                        __syncthreads();
-
-                        if(threadCompute){
-                            for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
-                                DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                            forceX, forceY, forceZ, potential,
-                                            sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
-                                DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                            forceX, forceY, forceZ, potential,
-                                            sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
-                                DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                            forceX, forceY, forceZ, potential,
-                                            sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
-                                DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                            forceX, forceY, forceZ, potential,
-                                            sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
-                            }
-
-                            for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
-                                DirectMacro(targetX, targetY, targetZ, targetPhys,
-                                            forceX, forceY, forceZ, potential,
-                                            sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
-                            }
+                        for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
                         }
+                    }
+
+                    __syncthreads();
+                }
 
-                        __syncthreads();
+                if( threadCompute ){
+                    targets->getAttribute(1)[idxPart] += potential;
+                    targets->getAttribute(2)[idxPart] += forceX;
+                    targets->getAttribute(3)[idxPart] += forceY;
+                    targets->getAttribute(4)[idxPart] += forceZ;
+                }
+
+
+                __syncthreads();
+            }
+        }
+    }
+
+    __device__ void P2POuter(const int3& ,
+                             ContainerClass* const  targets,
+                             ContainerClass* const directNeighborsParticles,
+                             const int* /*neighborsPositions*/, const int counter){
+        for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+
+            for(int idxPart = threadIdx.x ; (idxPart - int(threadIdx.x)) < targets->getNbParticles() ; idxPart += blockDim.x){ // uniform trip count: every thread runs ceil(nbParticles/blockDim.x) rounds, so all of them reach each __syncthreads()
+                const bool threadCompute = (idxPart < targets->getNbParticles());
+
+                FReal targetX, targetY, targetZ, targetPhys;
+                FReal forceX = 0, forceY = 0, forceZ = 0, potential = 0;
+
+                targetX = (threadCompute? targets->getPositions()[0][idxPart] : 0);
+                targetY = (threadCompute? targets->getPositions()[1][idxPart] : 0);
+                targetZ = (threadCompute? targets->getPositions()[2][idxPart] : 0);
+                targetPhys = (threadCompute? targets->getAttribute(0)[idxPart] : 0);
+
+                for(int idxCopy = 0 ; idxCopy < directNeighborsParticles[idxNeigh].getNbParticles() ; idxCopy += SHARE_SIZE){
+                    __shared__ FReal sourcesX[SHARE_SIZE];
+                    __shared__ FReal sourcesY[SHARE_SIZE];
+                    __shared__ FReal sourcesZ[SHARE_SIZE];
+                    __shared__ FReal sourcesPhys[SHARE_SIZE];
+
+                    const int nbCopies = Min(SHARE_SIZE, directNeighborsParticles[idxNeigh].getNbParticles()-idxCopy);
+                    if(threadIdx.x < nbCopies){ // NOTE(review): each thread fills at most one slot, so the tile is complete only if blockDim.x >= SHARE_SIZE - confirm launch config
+                        sourcesX[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[0][threadIdx.x+idxCopy];
+                        sourcesY[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[1][threadIdx.x+idxCopy];
+                        sourcesZ[threadIdx.x] = directNeighborsParticles[idxNeigh].getPositions()[2][threadIdx.x+idxCopy];
+                        sourcesPhys[threadIdx.x] = directNeighborsParticles[idxNeigh].getAttribute(0)[threadIdx.x+idxCopy];
                     }
 
-                    if( threadCompute ){
-                        targets->getAttribute(1)[idxPart] += forceX;
-                        targets->getAttribute(2)[idxPart] += forceY;
-                        targets->getAttribute(3)[idxPart] += forceZ;
-                        targets->getAttribute(4)[idxPart] += potential;
+                    __syncthreads();
+
+                    if(threadCompute){
+                        for(int otherIndex = 0; otherIndex < nbCopies - 3; otherIndex += 4) { // unrolling x4
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+1], sourcesY[otherIndex+1], sourcesZ[otherIndex+1], sourcesPhys[otherIndex+1]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+2], sourcesY[otherIndex+2], sourcesZ[otherIndex+2], sourcesPhys[otherIndex+2]);
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex+3], sourcesY[otherIndex+3], sourcesZ[otherIndex+3], sourcesPhys[otherIndex+3]);
+                        }
+
+                        for(int otherIndex = (nbCopies/4) * 4; otherIndex < nbCopies; ++otherIndex) { // if nk%4 is not zero
+                            DirectComputation(targetX, targetY, targetZ, targetPhys,
+                                              forceX, forceY, forceZ, potential,
+                                              sourcesX[otherIndex], sourcesY[otherIndex], sourcesZ[otherIndex], sourcesPhys[otherIndex]);
+                        }
                     }
 
+                    __syncthreads();
                 }
+
+                if( threadCompute ){
+                    targets->getAttribute(1)[idxPart] += potential;
+                    targets->getAttribute(2)[idxPart] += forceX;
+                    targets->getAttribute(3)[idxPart] += forceY;
+                    targets->getAttribute(4)[idxPart] += forceZ;
+                }
+
+                __syncthreads();
             }
         }
     }
@@ -250,7 +322,7 @@ public:
     }
 
     __host__ static dim3 GetGridSize(const int intervalSize){
-        return intervalSize;
+        return 1; // NOTE(review): intervalSize is no longer used and the grid is fixed to one block - confirm this is intended
     }
 
     __host__ static dim3 GetBlocksSize(){
diff --git a/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp
index 8c54c132ce7b33f26f09b4015c049e0878a0c45e..a538cb272487c557fbcd25df5217eb5c5292d323 100644
--- a/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp
+++ b/Src/GroupTree/StarPUUtils/FStarPUCpuWrapper.hpp
@@ -26,7 +26,7 @@
 #include <starpu.h>
 //}
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
 //extern "C"{
 #include <starpu_mpi.h>
 //}
@@ -184,7 +184,7 @@ public:
     /////////////////////////////////////////////////////////////////////////////////////
     /// Transfer Pass Mpi
     /////////////////////////////////////////////////////////////////////////////////////
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         CellContainerClass currentCells((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                                         STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
@@ -405,7 +405,7 @@ public:
     /// Direct Pass MPI
     /////////////////////////////////////////////////////////////////////////////////////
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         ParticleGroupClass containers((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                                       STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
diff --git a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
index 26c4a050f19eb411426305edd155b12a7f04aa26..5e438872e83e205ad5fdc3638c822eae068d6c2f 100644
--- a/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
+++ b/Src/GroupTree/StarPUUtils/FStarPUCudaWrapper.hpp
@@ -23,7 +23,7 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
 #include <starpu_mpi.h>
 #endif
 
@@ -96,21 +96,9 @@ public:
         FStarPUPtrInterface* worker = nullptr;
         int nbSubCellGroups = 0;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        FCudaParams<unsigned char*,9> subCellGroupsPtr;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
-        FCudaParams<std::size_t,9> subCellGroupsSize;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
-        FCudaParams<unsigned char*,9> subCellGroupsUpPtr;
-        memset(&subCellGroupsUpPtr, 0, sizeof(subCellGroupsUpPtr));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize.values[idxSubGroup] = STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]);
-            subCellGroupsUpPtr.values[idxSubGroup] = (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]);
-        }
-
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
         FCuda__upwardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
@@ -118,20 +106,22 @@ public:
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                 STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                 (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
-                subCellGroupsPtr,subCellGroupsSize,subCellGroupsUpPtr,
-                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
+                idxLevel, kernel, starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
 
     /////////////////////////////////////////////////////////////////////////////////////
     /// Transfer Pass Mpi
     /////////////////////////////////////////////////////////////////////////////////////
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        const std::vector<OutOfBlockInteraction>* outsideInteractions;
-        int intervalSize;
+        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -156,7 +146,7 @@ public:
     static void transferInPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -174,9 +164,10 @@ public:
     static void transferInoutPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
-        const std::vector<OutOfBlockInteraction>* outsideInteractions;
-        int intervalSize;
-        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
+        const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
+        int intervalSize = 0;
+        int mode = 0;
+        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
@@ -186,11 +177,9 @@ public:
                     STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
-                    STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]),
-                    (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[4]),
-                    (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[5]),
-                    idxLevel, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
+                    idxLevel, mode, outsideInteractions->data(), int(outsideInteractions->size()), kernel,
                     starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
@@ -202,21 +191,9 @@ public:
         FStarPUPtrInterface* worker = nullptr;
         int nbSubCellGroups = 0;
         int idxLevel = 0;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        FCudaParams<unsigned char*,9> subCellGroupsPtr;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsPtr));
-        FCudaParams<std::size_t,9> subCellGroupsSize;
-        memset(&subCellGroupsPtr, 0, sizeof(subCellGroupsSize));
-        FCudaParams<unsigned char*,9> subCellGroupsDownPtr;
-        memset(&subCellGroupsDownPtr, 0, sizeof(subCellGroupsDownPtr));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize.values[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsDownPtr.values[idxSubGroup] = ((unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[(idxSubGroup*2)+3]));
-        }
-
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
         FCuda__downardPassCallback< SymboleCellClass, PoleCellClass, LocalCellClass,
@@ -224,20 +201,22 @@ public:
                     (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[0]),
                 STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]),
                 (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[1]),
-                subCellGroupsPtr,subCellGroupsSize,subCellGroupsDownPtr,
-                nbSubCellGroups, idxLevel, kernel, starpu_cuda_get_local_stream(),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[2]),
+                STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]),
+                (unsigned char*)STARPU_VARIABLE_GET_PTR(buffers[3]),
+                idxLevel, kernel, starpu_cuda_get_local_stream(),
                 FCuda__GetGridSize(kernel,intervalSize),FCuda__GetBlockSize(kernel));
     }
     /////////////////////////////////////////////////////////////////////////////////////
     /// Direct Pass MPI
     /////////////////////////////////////////////////////////////////////////////////////
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){
 
         FStarPUPtrInterface* worker = nullptr;
         const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
@@ -260,7 +239,7 @@ public:
 
     static void directInPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &intervalSize);
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
 
@@ -276,7 +255,7 @@ public:
     static void directInoutPassCallback(void *buffers[], void *cl_arg){
         FStarPUPtrInterface* worker = nullptr;
         const std::vector<OutOfBlockInteraction>* outsideInteractions = nullptr;
-        int intervalSize;
+        int intervalSize = 0;
         starpu_codelet_unpack_args(cl_arg, &worker, &outsideInteractions, &intervalSize);
 
         CudaKernelClass* kernel = worker->get<ThisClass>(FSTARPU_CPU_IDX)->kernels[starpu_worker_get_id()];
diff --git a/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp b/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp
index c7d181d61f8cf6d7fec262eff775efc576323539..2620e4d072f8239e40624df5ab5892b88228d553 100644
--- a/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp
+++ b/Src/GroupTree/StarPUUtils/FStarPUOpenClWrapper.hpp
@@ -25,7 +25,7 @@
 
 #include <starpu.h>
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
 #include <starpu_mpi.h>
 #endif
 
@@ -100,22 +100,13 @@ public:
         int intervalSize;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        cl_mem subCellGroupsPtr[9];
-        memset(subCellGroupsPtr, 0, 9*sizeof(cl_mem));
-        cl_mem subCellGroupsUpPtr[9];
-        memset(subCellGroupsUpPtr, 0, 9*sizeof(cl_mem));
-        size_t subCellGroupsSize[9];
-        memset(subCellGroupsSize, 0, 9*sizeof(size_t));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsUpPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+3]));
-        }
+        cl_mem otherCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2]));
+        size_t otherCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]);
+        cl_mem otherCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3]));
 
         OpenCLKernelClass* kernel = worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()];
         kernel->upwardPassPerform(currentCellsPtr, currentCellsSize, currentCellsUpPtr,
-                                  subCellGroupsPtr, subCellGroupsSize, subCellGroupsUpPtr,
-                                  nbSubCellGroups, idxLevel,
+                                  otherCellsPtr, otherCellsSize, otherCellsUpPtr, idxLevel,
                                   intervalSize);
     }
 
@@ -123,7 +114,7 @@ public:
     /////////////////////////////////////////////////////////////////////////////////////
     /// Transfer Pass Mpi
     /////////////////////////////////////////////////////////////////////////////////////
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void transferInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         cl_mem currentCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0]));
         size_t currentCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]);
@@ -180,19 +171,18 @@ public:
     static void transferInoutPassCallback(void *buffers[], void *cl_arg){
         cl_mem currentCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0]));
         size_t currentCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]);
-        cl_mem currentCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[1]));
-        cl_mem currentCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2]));
+        cl_mem currentCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[1]));
 
-        cl_mem externalCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3]));
-        size_t externalCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[3]);
-        cl_mem externalCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[4]));
-        cl_mem externalCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[5]));
+        cl_mem externalCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2]));
+        size_t externalCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]);
+        cl_mem externalCellsUpPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3]));
 
         FStarPUPtrInterface* worker = nullptr;
         int idxLevel = 0;
         const std::vector<OutOfBlockInteraction>* outsideInteractions;
         int intervalSize;
-        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize);
+        int mode = 0;
+        starpu_codelet_unpack_args(cl_arg, &worker, &idxLevel, &outsideInteractions, &intervalSize, &mode);
 
         OpenCLKernelClass* kernel = worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()];
         cl_int errcode_ret;
@@ -202,9 +192,9 @@ public:
            const_cast<OutOfBlockInteraction*>(outsideInteractions->data()), &errcode_ret);
         FAssertLF(outsideInteractionsCl && errcode_ret == CL_SUCCESS);
 
-        kernel->transferInoutPassPerform(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr,
-                                         externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr,
-                                         idxLevel, outsideInteractionsCl, outsideInteractions->size(),
+        kernel->transferInoutPassPerform(currentCellsPtr, currentCellsSize, currentCellsDownPtr,
+                                         externalCellsPtr, externalCellsSize, externalCellsUpPtr,
+                                         idxLevel, mode, outsideInteractionsCl, outsideInteractions->size(),
                                          intervalSize);
 
         clReleaseMemObject(outsideInteractionsCl);
@@ -225,22 +215,13 @@ public:
         int intervalSize;
         starpu_codelet_unpack_args(cl_arg, &worker, &nbSubCellGroups, &idxLevel, &intervalSize);
 
-        cl_mem subCellGroupsPtr[9];
-        memset(subCellGroupsPtr, 0, 9*sizeof(cl_mem));
-        cl_mem subCellGroupsDownPtr[9];
-        memset(subCellGroupsDownPtr, 0, 9*sizeof(cl_mem));
-        size_t subCellGroupsSize[9];
-        memset(subCellGroupsSize, 0, 9*sizeof(size_t));
-        for(int idxSubGroup = 0; idxSubGroup < nbSubCellGroups ; ++idxSubGroup){
-            subCellGroupsPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsSize[idxSubGroup] = (STARPU_VARIABLE_GET_ELEMSIZE(buffers[(idxSubGroup*2)+2]));
-            subCellGroupsDownPtr[idxSubGroup] = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[(idxSubGroup*2)+3]));
-        }
+        cl_mem otherCellsPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[2]));
+        size_t otherCellsSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[2]);
+        cl_mem otherCellsDownPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[3]));
 
         OpenCLKernelClass* kernel = worker->get<ThisClass>(FSTARPU_OPENCL_IDX)->kernels[starpu_worker_get_id()];
         kernel->downardPassPerform(currentCellsPtr, currentCellsSize, currentCellsDownPtr,
-                                   subCellGroupsPtr, subCellGroupsSize, subCellGroupsDownPtr,
-                                   nbSubCellGroups, idxLevel,
+                                   otherCellsPtr, otherCellsSize, otherCellsDownPtr, idxLevel,
                                    intervalSize);
     }
 
@@ -248,7 +229,7 @@ public:
     /// Direct Pass MPI
     /////////////////////////////////////////////////////////////////////////////////////
 
-#ifdef STARPU_USE_MPI
+#if defined(STARPU_USE_MPI) && defined(SCALFMM_USE_MPI)
     static void directInoutPassCallbackMpi(void *buffers[], void *cl_arg){
         cl_mem containersPtr = ((cl_mem)STARPU_VARIABLE_GET_DEV_HANDLE(buffers[0]));
         size_t containersSize = STARPU_VARIABLE_GET_ELEMSIZE(buffers[0]);
diff --git a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
index 23325ea829ea174b4d1fa7946d24895c2e1b4259..a4d77289aca71346d3941447cc836f0d86cef87d 100644
--- a/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
+++ b/Src/GroupTree/TestKernel/FCudaTestKernels.hpp
@@ -34,13 +34,12 @@ public:
     }
 
     /** Before Downward */
-    __device__ void M2L(CellClass  local, const CellClass distantNeighbors[343], const int /*size*/, const int /*level*/) {
+    __device__ void M2L(CellClass  local, const CellClass* distantNeighbors,
+                const int* /*neighPositions*/, const int size, const int /*level*/) {
         if(threadIdx.x == 0) {
             // The pole is impacted by what represent other poles
-            for(int idx = 0 ; idx < 343 ; ++idx){
-                if(distantNeighbors[idx].symb){
-                    *local.down += *distantNeighbors[idx].up;
-                }
+            for(int idx = 0 ; idx < size ; ++idx){
+                *local.down += *distantNeighbors[idx].up;
             }
         }
     }
@@ -71,18 +70,18 @@ public:
 
     /** After Downward */
     __device__ void P2P(const int3& ,
-                 ContainerClass* const  targets, const ContainerClass* const  sources,
-                 ContainerClass* const directNeighborsParticles[27], const int ){
+                        ContainerClass* const  targets, const ContainerClass* const  sources,
+                        ContainerClass* const directNeighborsParticles,
+                        const int* /*neighborPositions*/,
+                        const int counter){
         if(threadIdx.x == 0) {
             // Each particles targeted is impacted by the particles sources
             long long int inc = sources->getNbParticles();
             if(targets == sources){
                 inc -= 1;
             }
-            for(int idx = 0 ; idx < 27 ; ++idx){
-                if( directNeighborsParticles[idx] ){
-                    inc += directNeighborsParticles[idx]->getNbParticles();
-                }
+            for(int idx = 0 ; idx < counter ; ++idx){
+                inc += directNeighborsParticles[idx].getNbParticles();
             }
 
             long long int*const particlesAttributes = targets->template getAttribute<0>();
@@ -94,15 +93,35 @@ public:
 
     /** After Downward */
     __device__ void P2PRemote(const int3& ,
-                 ContainerClass* const  targets, const ContainerClass* const  sources,
-                 ContainerClass* const directNeighborsParticles[27], const int ){
+                              ContainerClass* const  targets,
+                              const ContainerClass* const  sources,
+                              ContainerClass* const directNeighborsParticles,
+                              const int* /*neighborPositions*/,
+                              const int counter){
         if(threadIdx.x == 0) {
             // Each particles targeted is impacted by the particles sources
             long long int inc = 0;
-            for(int idx = 0 ; idx < 27 ; ++idx){
-                if( directNeighborsParticles[idx] ){
-                    inc += directNeighborsParticles[idx]->getNbParticles();
-                }
+            for(int idx = 0 ; idx < counter ; ++idx){
+                inc += directNeighborsParticles[idx].getNbParticles();
+            }
+
+            long long int*const particlesAttributes = targets->template getAttribute<0>();
+            for(FSize idxPart = 0 ; idxPart < targets->getNbParticles() ; ++idxPart){
+                particlesAttributes[idxPart] += inc;
+            }
+        }
+    }
+
+    __device__ void P2POuter(const int3& ,
+                             ContainerClass* const  targets,
+                             ContainerClass* const directNeighborsParticles,
+                             const int* /*neighborPositions*/,
+                             const int counter){
+        if(threadIdx.x == 0) {
+            // Each target particle is impacted by the particles of the neighbor leaves
+            long long int inc = 0;
+            for(int idx = 0 ; idx < counter ; ++idx){
+                inc += directNeighborsParticles[idx].getNbParticles();
             }
 
             long long int*const particlesAttributes = targets->template getAttribute<0>();
diff --git a/Src/GroupTree/TestKernel/FTestKernel.cl b/Src/GroupTree/TestKernel/FTestKernel.cl
index c09971e1146526ecf2c3b6e5694efa310f580e69..287b5d974b8776844c7bc43379f6697a43a46e84 100644
--- a/Src/GroupTree/TestKernel/FTestKernel.cl
+++ b/Src/GroupTree/TestKernel/FTestKernel.cl
@@ -51,6 +51,7 @@ struct OutOfBlockInteraction{
     MortonIndex insideIndex;
     int relativeOutPosition;
     int insideIdxInBlock;
+    int outsideIdxInBlock;
 } __attribute__ ((aligned (DefaultStructAlign)));
 
 #define Between(inValue, inMin, inMax)  ( (inMin) <= (inValue) && (inValue) < (inMax) )
@@ -560,12 +561,10 @@ void M2M(struct FWrappeCell  pole, struct FWrappeCell child[8], const int level,
     }
 }
 
-void M2L(struct FWrappeCell const pole, const struct FWrappeCell distantNeighbors[343],
-const int size, const int level, __global void* user_data) {
-    for(int idxNeigh = 0 ; idxNeigh < 343 ; ++idxNeigh){
-        if(distantNeighbors[idxNeigh].symb){
-            *pole.down += *distantNeighbors[idxNeigh].up;
-        }
+void M2L(struct FWrappeCell const pole, const struct FWrappeCell* distantNeighbors,
+         const int* relativePositions, const int size, const int level, __global void* user_data) {
+    for(int idxNeigh = 0 ; idxNeigh < size ; ++idxNeigh){
+        *pole.down += *distantNeighbors[idxNeigh].up;
     }
 }
 
@@ -610,6 +609,15 @@ void P2PRemote(const int3 pos,
     }
 }
 
+void P2POuter(const int3 pos, // test kernel: pos, sources, position and user_data are accepted but never read
+               struct FOpenCLGroupAttachedLeaf  targets, const struct FOpenCLGroupAttachedLeaf  sources,
+               struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){
+    __global long long* partdown = targets.attributes[0]; // first attribute array of the target leaf
+    for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){
+        partdown[idxPart] += directNeighborsParticles.nbParticles; // each target particle is incremented by the neighbor leaf's particle count
+    }
+}
+
 int3 getCoordinate(const struct FWrappeCell cell) {
     int3 coord;
     coord.x = cell.symb->coordinates[0];
@@ -659,49 +667,43 @@ __kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, s
 /////////////////////////////////////////////////////////////////////////////////////
 
 __kernel void FOpenCL__upwardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr,
-                                         struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsUpPtr,
-                                         int nbSubCellGroups, int idxLevel, __global void* userkernel){
+                                         __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr,
+                                         int idxLevel, __global void* userkernel){
     struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR);
-    struct FOpenCLGroupOfCells subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx] = BuildFOpenCLGroupOfCells(subCellGroupsPtr.ptrs[idx], subCellGroupsSize.v[idx], subCellGroupsUpPtr.ptrs[idx], NULLPTR);
-    }
-
-    FOpenCLAssertLF(nbSubCellGroups != 0);
     const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells);
-    int idxSubCellGroup = 0;
-    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&subCellGroups[0], FOpenCLGroupOfCells_getCellIndex(&currentCells, 0));
+    struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, childCellsUpPtr, NULLPTR); // the single child group handled by this call (up data only)
+    const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells);
 
-    for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){
-        struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, idxCell);
-        FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxCell));
-        struct FWrappeCell child[8];
+    const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); // lowest parent index within both ranges (child index >> 3 is its parent index)
+    const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); // highest such parent index; the -1 suggests ending indexes are exclusive - TODO confirm
 
-        FOpenCLAssertLF(idxSubCellGroup != nbSubCellGroups);
+    int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent); // NOTE(review): not checked against -1 before use - confirm firstParent always exists in currentCells
+    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); // presumably the position of firstParent's first child in childCells - verify against the helper
 
+    while(true){ // one iteration per parent cell, left through the break below
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, idxParentCell);
+        struct FWrappeCell child[8];
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = NULLPTR;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)) & 7);
-
-            child[idxChild] = FOpenCLGroupOfCells_getUpCell(&subCellGroups[idxSubCellGroup], idxChildCell);
+        do{ // gather the consecutive child cells whose parent index equals this cell's Morton index
+            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); // low 3 bits select the slot among the 8 children
+            child[idxChild] = FOpenCLGroupOfCells_getUpCell(&childCells, idxChildCell);
             idxChildCell += 1;
+        }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3));
 
-            if(idxChildCell == FOpenCLGroupOfCells_getNumberOfCellsInBlock(&subCellGroups[idxSubCellGroup])){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
+        M2M(cell, child, idxLevel, userkernel);
+
+        if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){ // the last shared parent has been processed
+            break;
         }
 
-        M2M(cell, child, idxLevel, userkernel);
+        idxParentCell += 1;
     }
 }
 
 
-
 /////////////////////////////////////////////////////////////////////////////////////
 /// Transfer Pass Mpi
 /////////////////////////////////////////////////////////////////////////////////////
@@ -723,11 +725,9 @@ __kernel  void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* curr
             struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock);
             FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
 
-            struct FWrappeCell interactions[343];
-            FSetToNullptr343(interactions);
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell;
-            const int counter = 1;
-            M2L( cell , interactions, counter, idxLevel, userkernel);
+            const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition;
+            M2L( cell , &interCell, &relativeOutPosition,
+                 1, idxLevel, userkernel);
         }
     }
 }
@@ -766,47 +766,50 @@ __kernel  void FOpenCL__transferInPassPerform(__global unsigned char* currentCel
                 const int cellPos = FOpenCLGroupOfCells_getCellIndex(&currentCells, interactionsIndexes[idxInter]);
                 if(cellPos != -1){
                     struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&currentCells, cellPos);
-                    FOpenCLAssertLF(interCell.symb->mortonIndex == interactionsIndexes[idxInter]);
-                    FOpenCLAssertLF(interactions[interactionsPosition[idxInter]].symb == NULLPTR);
-                    interactions[interactionsPosition[idxInter]] = interCell;
+                    interactions[counterExistingCell] = interCell;
+                    interactionsPosition[counterExistingCell] = interactionsPosition[idxInter];
                     counterExistingCell += 1;
                 }
             }
         }
 
-        M2L( cell , interactions, counterExistingCell, idxLevel, userkernel);
+        M2L( cell , interactions, interactionsPosition,
+             counterExistingCell, idxLevel, userkernel);
     }
 }
 
 
 
 __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,
-                                                __global unsigned char*  currentCellsUpPtr, __global unsigned char*  currentCellsDownPtr,
+                                                __global unsigned char*  currentCellsDownPtr, /* down data of the first group: the host callback binds buffers[1] here */
                                                 __global unsigned char* externalCellsPtr, size_t externalCellsSize,
-                                                __global unsigned char* externalCellsUpPtr, __global unsigned char* externalCellsDownPtr,
-                                                int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,
+                                                __global unsigned char* externalCellsUpPtr, /* up data of the second group: the host callback binds buffers[3] here */
+                                                int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions,
                                                 size_t nbOutsideInteractions, __global void* userkernel){
-    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
-    struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, externalCellsUpPtr, externalCellsDownPtr);
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr); // only the down part of the first group is available
+    struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, externalCellsUpPtr, NULLPTR); // only the up part of the second group is available
 
-    for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
-        const int cellPos = FOpenCLGroupOfCells_getCellIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex);
-        if(cellPos != -1){
-            FOpenCLAssertLF(outsideInteractions[outInterIdx].outIndex == FOpenCLGroupOfCells_getCellMortonIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex));
-            struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, cellPos);
+    if(mode == 1){ // first group holds the "inside" cells, second group the "outside" cells
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock);
             FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
             struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock);
             FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
 
-            struct FWrappeCell interactions[343];
-            FSetToNullptr343(interactions);
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition] = interCell;
-            const int counter = 1;
-            M2L( cell , interactions, counter, idxLevel, userkernel);
+            const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition;
+            M2L( cell , &interCell, &relativeOutPosition,
+                 1, idxLevel, userkernel);
+        }
+    }
+    else{ // NOTE(review): assumes the host swaps the groups for any other mode (first = outside cells, second = inside cells) - confirm against the task submission
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, outsideInteractions[outInterIdx].insideIdxInBlock); // source: up data, the only part bound for the second group
+            FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
+            struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].outsideIdxInBlock); // target: down data, the only part bound for the first group
+            FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
 
-            interactions[outsideInteractions[outInterIdx].relativeOutPosition].symb = NULLPTR;
-            interactions[FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition)] = cell;
-            M2L( interCell , interactions, counter, idxLevel, userkernel);
+            const int relativepos = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition); // position seen from the outside cell
+            M2L( cell , &interCell, &relativepos, 1, idxLevel, userkernel);
         }
     }
 }
@@ -819,46 +822,39 @@ __kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentC
 
 
 __kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,
-                                          struct Uptr9 subCellGroupsPtr, struct size_t9 subCellGroupsSize, struct Uptr9 subCellGroupsDownPtr,
-                                          int nbSubCellGroups, int idxLevel, __global void* userkernel){
-    FOpenCLAssertLF(nbSubCellGroups != 0);
+                                          __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr,
+                                          int idxLevel, __global void* userkernel){
     struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr);
-    struct FOpenCLGroupOfCells subCellGroups[9];
-    for(int idx = 0 ; idx < nbSubCellGroups ; ++idx){
-        subCellGroups[idx] = BuildFOpenCLGroupOfCells(subCellGroupsPtr.ptrs[idx], subCellGroupsSize.v[idx], NULLPTR, subCellGroupsDownPtr.ptrs[idx]);
-    }
-
-
-    FOpenCLAssertLF(nbSubCellGroups != 0);
     const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells);
-    int idxSubCellGroup = 0;
-    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&subCellGroups[0], FOpenCLGroupOfCells_getCellIndex(&currentCells, 0));
+    struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, NULLPTR, childCellsDownPtr); // the single child group handled by this call (down data only)
+    const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells);
 
-    for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){
-        struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxCell);
-        FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxCell));
-        struct FWrappeCell child[8];
+    const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3); // lowest parent index within both ranges (child index >> 3 is its parent index)
+    const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3); // highest such parent index; the -1 suggests ending indexes are exclusive - TODO confirm
 
-        FOpenCLAssertLF(idxSubCellGroup != nbSubCellGroups);
+    int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent); // NOTE(review): not checked against -1 before use - confirm firstParent always exists in currentCells
+    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent); // presumably the position of firstParent's first child in childCells - verify against the helper
 
+    while(true){ // one iteration per parent cell, left through the break below
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxParentCell);
+        struct FWrappeCell child[8];
         for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
             child[idxChild].symb = NULLPTR;
         }
 
-        while(idxSubCellGroup != nbSubCellGroups
-              && (FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)>>3) == cell.symb->mortonIndex){
-            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&subCellGroups[idxSubCellGroup], idxChildCell)) & 7);
-
-            child[idxChild] = FOpenCLGroupOfCells_getDownCell(&subCellGroups[idxSubCellGroup], idxChildCell);
+        do{ // gather the consecutive child cells whose parent index equals this cell's Morton index
+            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7); // low 3 bits select the slot among the 8 children
+            child[idxChild] = FOpenCLGroupOfCells_getDownCell(&childCells, idxChildCell);
             idxChildCell += 1;
+        }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3));
 
-            if(idxChildCell == FOpenCLGroupOfCells_getNumberOfCellsInBlock(&subCellGroups[idxSubCellGroup])){
-                idxChildCell = 0;
-                idxSubCellGroup += 1;
-            }
+        L2L(cell, child, idxLevel, userkernel);
+
+        if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){ // the last shared parent has been processed
+            break;
         }
 
-        L2L(cell, child, idxLevel, userkernel);
+        idxParentCell += 1;
     }
 }
 
@@ -884,7 +880,8 @@ __kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* contain
             struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock);
             FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex);
 
-            P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel);
+            P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles ,
+                       interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel);
         }
     }
 }
@@ -919,7 +916,7 @@ __kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr
         for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
             if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){
                 const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containers, interactionsIndexes[idxInter]);
-                if(leafPos){
+                if(leafPos != -1){
                     FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, leafPos) == interactionsIndexes[idxInter]);
                     interactionsObjects[counterExistingCell] = FOpenCLGroupOfParticles_getLeaf(&containers, leafPos);
                     neighPosition[counterExistingCell] = interactionsPosition[idxInter];
@@ -944,16 +941,18 @@ __kernel void FOpenCL__directInoutPassPerform(__global unsigned char* containers
     for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
         const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex);
         if(leafPos != -1){
-            FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containersOther, leafPos) == outsideInteractions[outInterIdx].outIndex);
-            struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, leafPos);
+            struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, outsideInteractions[outInterIdx].outsideIdxInBlock);
             struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock);
+
             FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex);
             FOpenCLAssertLF(particles.nbParticles);
             FOpenCLAssertLF(interParticles.nbParticles);
 
-            P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles , interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel );
+            P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles ,
+                      interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel );
 
-            P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles , particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel);
+            P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles ,
+                      particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel);
         }
     }
 }
diff --git a/Src/GroupTree/Uniform/FUniformKernel.cl b/Src/GroupTree/Uniform/FUniformKernel.cl
new file mode 100644
index 0000000000000000000000000000000000000000..858e4bb3e50ba93623bb4a7cede912bd9c6b2f6a
--- /dev/null
+++ b/Src/GroupTree/Uniform/FUniformKernel.cl
@@ -0,0 +1,986 @@
+/** This file contains the prototype for a kernel in opencl */
+// @SCALFMM_PRIVATE
+
+
+/***************************************************************************/
+/***************************************************************************/
+/************************CHANGE THINGS HERE*********************************/
+/***************************************************************************/
+
+typedef ___FSize___ FSize;
+typedef ___FReal___ FReal;
+typedef ___FParticleValueClass___ FParticleValueClass;
+typedef long long int MortonIndex;
+
+#define FOpenCLGroupOfCellsCellIsEmptyFlag  ((MortonIndex)-1)
+
+#define NbAttributesPerParticle ___NbAttributesPerParticle___
+#define NbSymbAttributes ___NbSymbAttributes___
+
+#define FOpenCLGroupOfParticlesMemoryAlignementBytes  ___FP2PDefaultAlignement___
+#define FOpenCLGroupOfParticlesMemoryAlignementParticles (FOpenCLGroupOfParticlesMemoryAlignementBytes/sizeof(FReal))
+#define FOpenCLGroupOfParticlesLeafIsEmptyFlag ((MortonIndex)-1)
+
+#define NULLPTR (0)
+
+#define DefaultStructAlign ___DefaultStructAlign___
+
+struct FSymboleCellClass {
+    MortonIndex mortonIndex;
+    int coordinates[3];
+} __attribute__ ((aligned (DefaultStructAlign)));
+
+typedef FReal FPoleCellClass;
+typedef FReal FLocalCellClass;
+
+struct FWrappeCell{
+    __global struct FSymboleCellClass* symb;
+    __global FPoleCellClass* up;
+    __global FLocalCellClass* down;
+};
+
+#define ORDER __ORDER__
+#define POLE_SIZE __POLE_SIZE__
+#define LOCAL_SIZE __LOCAL_SIZE__
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+struct OutOfBlockInteraction{
+    MortonIndex outIndex;
+    MortonIndex insideIndex;
+    int relativeOutPosition;
+    int insideIdxInBlock;
+    int outsideIdxInBlock;
+} __attribute__ ((aligned (DefaultStructAlign)));
+
+#define Between(inValue, inMin, inMax)  ( (inMin) <= (inValue) && (inValue) < (inMax) )
+#define pow2(power)  (1 << (power))
+#define Abs(inV) ((inV) < 0 ? -(inV) : (inV)) /* argument parenthesized so expressions such as Abs(a-b) expand correctly */
+
+int3 GetPositionFromMorton(MortonIndex inIndex, const int inLevel){
+    MortonIndex mask = 0x1LL;
+    // Decode an interleaved Morton index into (x, y, z); inLevel is the number of bits read per axis.
+    int3 coord;
+    coord.x = 0;
+    coord.y = 0;
+    coord.z = 0;
+    // Iteration k reads index bits 3k (z), 3k+1 (y) and 3k+2 (x) and stores each one at bit k of its axis.
+    for(int indexLevel = 0; indexLevel < inLevel ; ++indexLevel){
+        coord.z |= (int)(inIndex & mask);
+        inIndex >>= 1;
+        coord.y |= (int)(inIndex & mask);
+        inIndex >>= 1;
+        coord.x |= (int)(inIndex & mask);
+        // inIndex dropped two bits above; moving mask up by one completes the three-bit stride per level.
+        mask <<= 1;
+    }
+
+    return coord;
+}
+
+MortonIndex GetMortonIndex(const int3 coord, const int inLevel) {
+    MortonIndex index = 0x0LL;
+    MortonIndex mask = 0x1LL;
+    // the order is xyz.xyz... : x, y and z are pre-shifted so their lowest bits sit at bits 2, 1 and 0
+    MortonIndex mx = coord.x << 2;
+    MortonIndex my = coord.y << 1;
+    MortonIndex mz = coord.z;
+    // Each level copies one bit per axis (z, then y, then x) into three consecutive bits of index.
+    for(int indexLevel = 0; indexLevel < inLevel ; ++indexLevel){
+        index |= (mz & mask);
+        mask <<= 1;
+        index |= (my & mask);
+        mask <<= 1;
+        index |= (mx & mask);
+        mask <<= 1;
+        // mask moved up three bits; shifting each axis by two lines its next bit up with the coming mask positions
+        mz <<= 2;
+        my <<= 2;
+        mx <<= 2;
+    }
+
+    return index;
+}
+
+int GetNeighborsIndexes(const int3 coord, const int OctreeHeight, MortonIndex indexes[26], int indexInArray[26]) {
+    int idxNeig = 0;
+    int limite = 1 << (OctreeHeight - 1);
+    // We test all cells around
+    for(int idxX = -1 ; idxX <= 1 ; ++idxX){
+        if(!Between(coord.x + idxX,0, limite)) continue;
+
+        for(int idxY = -1 ; idxY <= 1 ; ++idxY){
+            if(!Between(coord.y + idxY,0, limite)) continue;
+
+            for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){
+                if(!Between(coord.z + idxZ,0, limite)) continue;
+
+                // if we are not on the current cell
+                if( idxX || idxY || idxZ ){
+                    int3 other;
+
+                    other.x = coord.x + idxX;
+                    other.y = coord.y + idxY;
+                    other.z = coord.z + idxZ;
+
+                    indexes[ idxNeig ] = GetMortonIndex(other, OctreeHeight - 1);
+                    indexInArray[ idxNeig ] = ((idxX+1)*3 + (idxY+1)) * 3 + (idxZ+1);
+                    ++idxNeig;
+                }
+            }
+        }
+    }
+    return idxNeig;
+}
+
+int GetInteractionNeighbors(const int3 coord, const int inLevel, MortonIndex inNeighbors[189], int inNeighborsPosition[189]) {
+    // Then take each child of the parent's neighbors if not in directNeighbors
+    // Father coordinate
+    int3 parentCell;
+    parentCell.x = coord.x>>1;
+    parentCell.y = coord.y>>1;
+    parentCell.z = coord.z>>1;
+
+    // Limite at parent level number of box (split by 2 by level)
+    const int limite = pow2(inLevel-1);
+
+    int idxNeighbors = 0;
+    // We test all cells around
+    for(int idxX = -1 ; idxX <= 1 ; ++idxX){
+        if(!Between(parentCell.x + idxX,0,limite)) continue;
+
+        for(int idxY = -1 ; idxY <= 1 ; ++idxY){
+            if(!Between(parentCell.y + idxY,0,limite)) continue;
+
+            for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){
+                if(!Between(parentCell.z + idxZ,0,limite)) continue;
+
+                // if we are not on the current cell
+                if( idxX || idxY || idxZ ){
+                    int3 otherParent;
+
+                    otherParent.x = parentCell.x + idxX;
+                    otherParent.y = parentCell.y + idxY;
+                    otherParent.z = parentCell.z + idxZ;
+
+                    const MortonIndex mortonOther = GetMortonIndex(otherParent, inLevel-1);
+
+                    // For each child
+                    for(int idxCousin = 0 ; idxCousin < 8 ; ++idxCousin){
+                        const int xdiff  = ((otherParent.x<<1) | ( (idxCousin>>2) & 1)) - coord.x;
+                        const int ydiff  = ((otherParent.y<<1) | ( (idxCousin>>1) & 1)) - coord.y;
+                        const int zdiff  = ((otherParent.z<<1) | (idxCousin&1)) - coord.z;
+
+                        // Test if it is a direct neighbor
+                        if(Abs(xdiff) > 1 || Abs(ydiff) > 1 || Abs(zdiff) > 1){
+                            // add to neighbors
+                            inNeighborsPosition[idxNeighbors] = ((( (xdiff+3) * 7) + (ydiff+3))) * 7 + zdiff + 3;
+                            inNeighbors[idxNeighbors++] = (mortonOther << 3) | idxCousin;
+                        }
+                    }
+                }
+            }
+        }
+    }
+
+    return idxNeighbors;
+}
+
+
+void FSetToNullptr343(struct FWrappeCell ptrs[343]){
+    int idx;
+    for( idx = 0 ; idx < 343 ; ++idx){
+        ptrs[idx].symb = NULLPTR;
+    }
+}
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+
+struct FOpenCLGroupAttachedLeaf {
+    //< Nb of particles in the current leaf
+    FSize nbParticles;
+    //< Pointers to the positions of the particles
+    __global FReal* positionsPointers[3];
+    //< Pointers to the attributes of the particles
+    __global FParticleValueClass* attributes[NbSymbAttributes+NbAttributesPerParticle];
+};
+
+struct FOpenCLGroupAttachedLeaf BuildFOpenCLGroupAttachedLeaf(const FSize inNbParticles, __global FReal* inPositionBuffer, const size_t inLeadingPosition,
+                                                              __global FParticleValueClass* inAttributesBuffer, const size_t inLeadingAttributes){
+    struct FOpenCLGroupAttachedLeaf leaf;
+    leaf.nbParticles = (inNbParticles);
+    // Redirect pointers to position
+    leaf.positionsPointers[0] = inPositionBuffer;
+    leaf.positionsPointers[1] = (__global FReal*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition);
+    leaf.positionsPointers[2] = (__global FReal*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition*2);
+
+    for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){
+        leaf.attributes[idxAttribute] =  (__global FParticleValueClass*)(((__global unsigned char*)inPositionBuffer) + inLeadingPosition*(idxAttribute+3));
+    }
+
+    // Redirect pointers to data
+    if(inAttributesBuffer){
+        for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
+            leaf.attributes[idxAttribute+NbSymbAttributes] = (__global FParticleValueClass*)(((__global unsigned char*)inAttributesBuffer) + idxAttribute*inLeadingAttributes);
+        }
+    }
+    else{
+        for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
+            leaf.attributes[idxAttribute+NbSymbAttributes] = NULLPTR;
+        }
+    }
+    return leaf;
+}
+
+struct FOpenCLGroupAttachedLeaf EmptyFOpenCLGroupAttachedLeaf(){
+    struct FOpenCLGroupAttachedLeaf leaf;
+    leaf.nbParticles = -1;
+    // Redirect pointers to position
+    leaf.positionsPointers[0] = NULLPTR;
+    leaf.positionsPointers[1] = NULLPTR;
+    leaf.positionsPointers[2] = NULLPTR;
+
+    // Redirect pointers to data
+    for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes+NbAttributesPerParticle ; ++idxAttribute){
+        leaf.attributes[idxAttribute] = NULLPTR;
+    }
+    return leaf;
+}
+
+bool FOpenCLGroupAttachedLeaf_isAttachedToSomething(const struct FOpenCLGroupAttachedLeaf* group){
+    return (group->nbParticles != -1);
+}
+FSize FOpenCLGroupAttachedLeaf_getNbParticles(const struct FOpenCLGroupAttachedLeaf* group){ // returns the leaf's particle count
+    return (group->nbParticles); // FSize keeps the real count (a bool return collapsed it to 0/1); -1 marks an unattached leaf
+}
+
+
+/** One header is allocated at the beginning of each block */
+struct FOpenCLGroupOfParticlesBlockHeader{
+    MortonIndex startingIndex;
+    MortonIndex endingIndex;
+    int numberOfLeavesInBlock;
+
+    //< The real number of particles allocated
+    FSize nbParticlesAllocatedInGroup;
+    //< Starting point of position
+    size_t offsetPosition;
+    //< Bytes difference/offset between position
+    size_t positionsLeadingDim;
+    //< Bytes difference/offset between attributes
+    size_t attributeLeadingDim;
+    //< The total number of particles in the group
+    FSize nbParticlesInGroup;
+}__attribute__ ((aligned (DefaultStructAlign)));
+
+/** Information about a leaf */
+struct FOpenCLGroupOfParticlesLeafHeader {
+    MortonIndex mindex;
+    FSize nbParticles;
+    size_t offSet;
+}__attribute__ ((aligned (DefaultStructAlign)));
+
+
+struct FOpenCLGroupOfParticles {
+    //< The size of memoryBuffer in byte
+    size_t allocatedMemoryInByte;
+    //< Pointer to a block memory
+    __global unsigned char* memoryBuffer;
+
+    //< Pointer to the header inside the block memory
+    __global struct FOpenCLGroupOfParticlesBlockHeader*    blockHeader;
+    //< Pointer to leaves information
+    __global struct FOpenCLGroupOfParticlesLeafHeader*     leafHeader;
+    //< The total number of particles in the group
+    const FSize nbParticlesInGroup;
+
+    //< Pointers to particle position x, y, z
+    __global FReal* particlePosition[3];
+
+    //< Pointers to the particles data inside the block memory
+    __global FParticleValueClass*      attributesBuffer;
+    __global FParticleValueClass*      particleAttributes[NbSymbAttributes+NbAttributesPerParticle];
+};
+
+struct FOpenCLGroupOfParticles BuildFOpenCLGroupOfParticles(__global unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
+                                                            __global unsigned char* inAttributeBuffer){
+    struct FOpenCLGroupOfParticles group;
+    group.allocatedMemoryInByte = (inAllocatedMemoryInByte);
+    group.memoryBuffer = (inBuffer);
+
+    // Move the pointers to the correct position
+    group.blockHeader         = ((__global struct FOpenCLGroupOfParticlesBlockHeader*)inBuffer);
+    inBuffer += sizeof(struct FOpenCLGroupOfParticlesBlockHeader);
+    group.leafHeader          = ((__global struct FOpenCLGroupOfParticlesLeafHeader*)inBuffer);
+
+    // Init particle pointers
+    // Assert group.blockHeader->positionsLeadingDim == (sizeof(FReal) * group.blockHeader->nbParticlesAllocatedInGroup);
+    group.particlePosition[0] = (__global FReal*) (group.memoryBuffer + group.blockHeader->offsetPosition);
+    group.particlePosition[1] = (group.particlePosition[0] + group.blockHeader->nbParticlesAllocatedInGroup);
+    group.particlePosition[2] = (group.particlePosition[1] + group.blockHeader->nbParticlesAllocatedInGroup);
+
+    // Redirect pointer to data
+    // Assert group.blockHeader->attributeLeadingDim == (sizeof(FParticleValueClass) * group.blockHeader->nbParticlesAllocatedInGroup);
+    __global unsigned char* previousPointer = ((__global unsigned char*)(group.particlePosition[2] + group.blockHeader->nbParticlesAllocatedInGroup));
+    for(unsigned idxAttribute = 0 ; idxAttribute < NbSymbAttributes ; ++idxAttribute){
+        group.particleAttributes[idxAttribute] = ((__global FParticleValueClass*)previousPointer);
+        previousPointer += sizeof(FParticleValueClass)*group.blockHeader->nbParticlesAllocatedInGroup;
+    }
+
+    if(inAttributeBuffer){
+        group.attributesBuffer = (__global FParticleValueClass*)inAttributeBuffer;
+        for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
+            group.particleAttributes[idxAttribute+NbSymbAttributes] = ((__global FParticleValueClass*)inAttributeBuffer);
+            inAttributeBuffer += sizeof(FParticleValueClass)*group.blockHeader->nbParticlesAllocatedInGroup;
+        }
+    }
+    else{
+        group.attributesBuffer = NULLPTR;
+        for(unsigned idxAttribute = 0 ; idxAttribute < NbAttributesPerParticle ; ++idxAttribute){
+            group.particleAttributes[idxAttribute+NbSymbAttributes] = NULLPTR;
+        }
+    }
+
+    return group;
+}
+MortonIndex FOpenCLGroupOfParticles_getStartingIndex(const struct FOpenCLGroupOfParticles* group) {
+    return group->blockHeader->startingIndex;
+}
+MortonIndex FOpenCLGroupOfParticles_getEndingIndex(const struct FOpenCLGroupOfParticles* group) {
+    return group->blockHeader->endingIndex;
+}
+int FOpenCLGroupOfParticles_getNumberOfLeaves(const struct FOpenCLGroupOfParticles* group) {
+    return group->blockHeader->numberOfLeavesInBlock;
+}
+bool FOpenCLGroupOfParticles_isInside(const struct FOpenCLGroupOfParticles* group, const MortonIndex inIndex) {
+    return group->blockHeader->startingIndex <= inIndex && inIndex < group->blockHeader->endingIndex;
+}
+
+
+/** Return the Morton index of the leaf stored at position id in the leaf header array */
+MortonIndex FOpenCLGroupOfParticles_getLeafMortonIndex(const struct FOpenCLGroupOfParticles* group, const int id){
+    return group->leafHeader[id].mindex;
+}
+
+/** Check if a leaf exists (by binary search) and return its index, or -1 if it is not found */
+int FOpenCLGroupOfParticles_getLeafIndex(const struct FOpenCLGroupOfParticles* group, const MortonIndex leafIdx){
+    int idxLeft = 0;
+    int idxRight = group->blockHeader->numberOfLeavesInBlock-1;
+    while(idxLeft <= idxRight){
+        const int idxMiddle = (idxLeft+idxRight)/2;
+        if(group->leafHeader[idxMiddle].mindex == leafIdx){
+            return idxMiddle;
+        }
+        if(leafIdx < group->leafHeader[idxMiddle].mindex){
+            idxRight = idxMiddle-1;
+        }
+        else{
+            idxLeft = idxMiddle+1;
+        }
+    }
+    return -1;
+}
+
+
+bool FOpenCLGroupOfParticles_exists(const struct FOpenCLGroupOfParticles* group, const MortonIndex inIndex) {
+    return FOpenCLGroupOfParticles_isInside(group, inIndex) && (FOpenCLGroupOfParticles_getLeafIndex(group, inIndex) != -1);
+}
+struct FOpenCLGroupAttachedLeaf FOpenCLGroupOfParticles_getLeaf(struct FOpenCLGroupOfParticles* group, const int id){
+    return BuildFOpenCLGroupAttachedLeaf(group->leafHeader[id].nbParticles,
+                                         group->particlePosition[0] + group->leafHeader[id].offSet,
+            group->blockHeader->positionsLeadingDim,
+            (group->attributesBuffer?group->particleAttributes[NbSymbAttributes] + group->leafHeader[id].offSet:NULLPTR),
+            group->blockHeader->attributeLeadingDim);
+}
+
+
+struct FOpenCLGroupOfCellsBlockHeader{
+    MortonIndex startingIndex;
+    MortonIndex endingIndex;
+    int numberOfCellsInBlock;
+} __attribute__ ((aligned (DefaultStructAlign)));
+
+
+struct FOpenCLGroupOfCells {
+    //< The size of the memoryBuffer
+    size_t allocatedMemoryInByte;
+    //< Pointer to a block memory
+    __global unsigned char* memoryBuffer;
+
+    //< Pointer to the header inside the block memory
+    __global struct FOpenCLGroupOfCellsBlockHeader*    blockHeader;
+    //< Pointer to the indexes table inside the block memory
+    __global MortonIndex*    cellIndexes;
+    //< Pointer to the cells inside the block memory
+    __global struct FSymboleCellClass*      blockCells;
+
+    //< The multipole data
+    __global FPoleCellClass* cellMultipoles;
+    //< The local data
+    __global FLocalCellClass* cellLocals;
+};
+
+struct FOpenCLGroupOfCells BuildFOpenCLGroupOfCells(__global unsigned char* inBuffer, const size_t inAllocatedMemoryInByte,
+                                                    __global unsigned char* inCellMultipoles, __global unsigned char* inCellLocals){
+    struct FOpenCLGroupOfCells group;
+    group.memoryBuffer = (inBuffer);
+    group.allocatedMemoryInByte = (inAllocatedMemoryInByte);
+
+    // Move the pointers to the correct position
+    group.blockHeader         = (__global struct FOpenCLGroupOfCellsBlockHeader*)(inBuffer);
+    inBuffer += sizeof(struct FOpenCLGroupOfCellsBlockHeader);
+    group.cellIndexes   = (__global MortonIndex*)(inBuffer);
+    inBuffer += (group.blockHeader->numberOfCellsInBlock*sizeof(MortonIndex));
+    group.blockCells          = (__global struct FSymboleCellClass*)(inBuffer);
+    inBuffer += (group.blockHeader->numberOfCellsInBlock*sizeof(struct FSymboleCellClass));
+    // Assert(((size_t)(inBuffer-group.memoryBuffer) == inAllocatedMemoryInByte);
+
+    group.cellMultipoles = (__global FPoleCellClass*)inCellMultipoles;
+    group.cellLocals = (__global FLocalCellClass*)inCellLocals;
+    return group;
+}
+MortonIndex FOpenCLGroupOfCells_getStartingIndex(const struct FOpenCLGroupOfCells* group) {
+    return group->blockHeader->startingIndex;
+}
+MortonIndex FOpenCLGroupOfCells_getEndingIndex(const struct FOpenCLGroupOfCells* group) {
+    return group->blockHeader->endingIndex;
+}
+int FOpenCLGroupOfCells_getNumberOfCellsInBlock(const struct FOpenCLGroupOfCells* group) {
+    return group->blockHeader->numberOfCellsInBlock;
+}
+MortonIndex FOpenCLGroupOfCells_getSizeOfInterval(const struct FOpenCLGroupOfCells* group) {
+    return group->blockHeader->endingIndex - group->blockHeader->startingIndex;
+}
+bool FOpenCLGroupOfCells_isInside(const struct FOpenCLGroupOfCells* group, const MortonIndex inIndex){
+    return group->blockHeader->startingIndex <= inIndex && inIndex < group->blockHeader->endingIndex;
+}
+
+MortonIndex FOpenCLGroupOfCells_getCellMortonIndex(const struct FOpenCLGroupOfCells* group,const int cellPos){
+    return group->cellIndexes[cellPos];
+}
+
+int FOpenCLGroupOfCells_getCellIndex(const struct FOpenCLGroupOfCells* group,const MortonIndex cellIdx){
+    int idxLeft = 0;
+    int idxRight = group->blockHeader->numberOfCellsInBlock-1;
+    while(idxLeft <= idxRight){
+        const int idxMiddle = (idxLeft+idxRight)/2;
+        if(group->cellIndexes[idxMiddle] == cellIdx){
+            return idxMiddle;
+        }
+        if(cellIdx < group->cellIndexes[idxMiddle]){
+            idxRight = idxMiddle-1;
+        }
+        else{
+            idxLeft = idxMiddle+1;
+        }
+    }
+    return -1;
+}
+
+int FOpenCLGroupOfCells_getFistChildIdx(const struct FOpenCLGroupOfCells* group, const MortonIndex parentIdx) {
+    int idxLeft = 0;
+    int idxRight = group->blockHeader->numberOfCellsInBlock-1;
+    while(idxLeft <= idxRight){
+        int idxMiddle = (idxLeft+idxRight)/2;
+        if((group->cellIndexes[idxMiddle]>>3) == parentIdx){
+            while(0 < idxMiddle && (group->cellIndexes[idxMiddle-1]>>3) == parentIdx){
+                idxMiddle -= 1;
+            }
+            return idxMiddle;
+        }
+        if(parentIdx < (group->cellIndexes[idxMiddle]>>3)){
+            idxRight = idxMiddle-1;
+        }
+        else{
+            idxLeft = idxMiddle+1;
+        }
+    }
+    return -1;
+}
+
+
+bool FOpenCLGroupOfCells_exists(const struct FOpenCLGroupOfCells* group, const MortonIndex inIndex) {
+    return FOpenCLGroupOfCells_isInside(group, inIndex) && FOpenCLGroupOfCells_getCellIndex(group, inIndex) != -1;
+}
+struct FWrappeCell FOpenCLGroupOfCells_getCompleteCell(struct FOpenCLGroupOfCells* group, const int cellPos){
+    struct FWrappeCell cell;
+    cell.symb = &group->blockCells[cellPos];
+    cell.up = &group->cellMultipoles[cellPos];
+    cell.down = &group->cellLocals[cellPos];
+    return cell;
+}
+
+struct FWrappeCell FOpenCLGroupOfCells_getUpCell(struct FOpenCLGroupOfCells* group, const int cellPos){
+    struct FWrappeCell cell;
+    cell.symb = &group->blockCells[cellPos];
+    cell.up = &group->cellMultipoles[cellPos];
+    cell.down = NULLPTR;
+    return cell;
+}
+
+struct FWrappeCell FOpenCLGroupOfCells_getDownCell(struct FOpenCLGroupOfCells* group, const int cellPos){
+    struct FWrappeCell cell;
+    cell.symb = &group->blockCells[cellPos];
+    cell.up = NULLPTR;
+    cell.down =&group->cellLocals[cellPos];
+    return cell;
+}
+
+struct Uptr9{
+    __global unsigned char* ptrs[9];
+} __attribute__ ((aligned (DefaultStructAlign)));
+
+struct size_t9{
+    size_t v[9];
+} __attribute__ ((aligned (DefaultStructAlign)));
+
+struct Uptr343{
+    __global unsigned char* ptrs[343];
+};
+
+/***************************************************************************/
+/***************************************************************************/
+/************************CHANGE THINGS HERE*********************************/
+/***************************************************************************/
+
+
+void P2M(struct FWrappeCell pole, const struct FOpenCLGroupAttachedLeaf particles, __global void* user_data) {
+    *pole.up = particles.nbParticles;
+}
+
+void M2M(struct FWrappeCell  pole, struct FWrappeCell child[8], const int level, __global void* user_data) {
+    for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+        if(child[idxChild].symb){
+            *pole.up += *child[idxChild].up;
+        }
+    }
+}
+
+void M2L(struct FWrappeCell const pole, const struct FWrappeCell* distantNeighbors,
+         const int* relativePositions, const int size, const int level, __global void* user_data) {
+    for(int idxNeigh = 0 ; idxNeigh < size ; ++idxNeigh){
+        *pole.down += *distantNeighbors[idxNeigh].up;
+    }
+}
+
+void L2L(const struct FWrappeCell localCell, struct FWrappeCell child[8], const int level, __global void* user_data) {
+    for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+        if(child[idxChild].symb){
+            *child[idxChild].down += *localCell.down;
+        }
+    }
+}
+
+void L2P(const struct FWrappeCell localCell, struct FOpenCLGroupAttachedLeaf particles, __global void* user_data){
+    __global long long* partdown = particles.attributes[0];
+    for(FSize idxPart = 0 ; idxPart < particles.nbParticles ; ++idxPart){
+        partdown[idxPart] += *localCell.down;
+    }
+}
+
+void P2P(const int3 pos,
+         struct FOpenCLGroupAttachedLeaf  targets, const struct FOpenCLGroupAttachedLeaf sources,
+         struct FOpenCLGroupAttachedLeaf directNeighborsParticles[27], int directNeighborsPositions[27], const int counter, __global void* user_data){
+    long long cumul = sources.nbParticles-1;
+
+    for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
+        if(FOpenCLGroupAttachedLeaf_isAttachedToSomething(&directNeighborsParticles[idxNeigh])){
+            cumul += directNeighborsParticles[idxNeigh].nbParticles;
+        }
+    }
+
+    __global long long* partdown = targets.attributes[0];
+    for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){
+        partdown[idxPart] += cumul;
+    }
+}
+
+void P2PRemote(const int3 pos,
+               struct FOpenCLGroupAttachedLeaf  targets, const struct FOpenCLGroupAttachedLeaf  sources,
+               struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){
+    __global long long* partdown = targets.attributes[0];
+    for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){
+        partdown[idxPart] += directNeighborsParticles.nbParticles;
+    }
+}
+
+void P2POuter(const int3 pos,
+               struct FOpenCLGroupAttachedLeaf  targets, const struct FOpenCLGroupAttachedLeaf  sources,
+               struct FOpenCLGroupAttachedLeaf directNeighborsParticles, const int position, __global void* user_data){
+    __global long long* partdown = targets.attributes[0];
+    for(FSize idxPart = 0 ; idxPart < targets.nbParticles ; ++idxPart){
+        partdown[idxPart] += directNeighborsParticles.nbParticles;
+    }
+}
+
+int3 getCoordinate(const struct FWrappeCell cell) {
+    int3 coord;
+    coord.x = cell.symb->coordinates[0];
+    coord.y = cell.symb->coordinates[1];
+    coord.z = cell.symb->coordinates[2];
+    return coord;
+}
+
+
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+/***************************************************************************/
+
+#define FOpenCLCheck( test ) { FOpenCLCheckCore((test), __FILE__, __LINE__); }
+#define FOpenCLCheckAfterCall() { FOpenCLCheckCore((cudaGetLastError()), __FILE__, __LINE__); }
+#define FOpenCLAssertLF(ARGS) if(!(ARGS)){ *((char*)0x09) = 'e'; }
+//#define FOpenCLAssertLF(ARGS) ARGS;
+
+#define FMGetOppositeNeighIndex(index) (27-(index)-1)
+#define FMGetOppositeInterIndex(index) (343-(index)-1)
+
+#define FOpenCLMax(x,y) ((x)<(y) ? (y) : (x))
+#define FOpenCLMin(x,y) ((x)>(y) ? (y) : (x))
+
+
+__kernel void FOpenCL__bottomPassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize,__global unsigned char* leafCellsUpPtr,
+                                         __global unsigned char* containersPtr, size_t containersSize,
+                                         __global void* userkernel ){
+    struct FOpenCLGroupOfCells leafCells = BuildFOpenCLGroupOfCells(leafCellsPtr, leafCellsSize, leafCellsUpPtr, NULLPTR);
+    struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, NULLPTR);
+
+    const int nbLeaves = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&leafCells);
+
+    for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&leafCells, idxLeaf);
+        FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf));
+        struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf);
+        FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf) == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf));
+        P2M(cell, particles, userkernel);
+    }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Upward Pass
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each parent cell in the Morton range common to both blocks, gathers its child cells and calls M2M.
+__kernel void FOpenCL__upwardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsUpPtr,
+                                         __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsUpPtr,
+                                         int idxLevel, __global void* userkernel){
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR);
+    const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells); // not read again in this kernel
+    struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, childCellsUpPtr, NULLPTR);
+    const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells);
+    // Parent Morton range covered by both blocks; a child's parent index is its Morton index >> 3.
+    const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3);
+    const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3);
+
+    int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent);
+    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent);
+    // NOTE(review): neither start index is checked against -1 (the "not found" value tested in other kernels) - confirm the caller guarantees a shared parent.
+    while(true){
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, idxParentCell);
+        struct FWrappeCell child[8];
+        for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+            child[idxChild].symb = NULLPTR;
+        }
+        // Take consecutive child cells while their parent (Morton index >> 3) is this cell; slot = Morton index & 7.
+        do{
+            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7);
+            child[idxChild] = FOpenCLGroupOfCells_getUpCell(&childCells, idxChildCell);
+            idxChildCell += 1;
+        }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3));
+
+        M2M(cell, child, idxLevel, userkernel);
+        // Stop once the parent whose Morton index equals lastParent has been processed.
+        if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){
+            break;
+        }
+
+        idxParentCell += 1;
+    }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Transfer Pass Mpi
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each listed out-of-block interaction whose source cell is found in the external block,
+/// calls M2L once with the local cell (down data) and that external cell (up data).
+__kernel  void FOpenCL__transferInoutPassPerformMpi(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,
+                                                    __global unsigned char* externalCellsPtr, size_t externalCellsSize, __global unsigned char* externalCellsUpPtr,
+                                                    int idxLevel, const __global struct OutOfBlockInteraction* outsideInteractions,
+                                                    size_t nbOutsideInteractions, __global void* userkernel){
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr);
+    struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, externalCellsUpPtr, NULLPTR);
+    // Interactions whose outIndex is not found in the external block (getCellIndex == -1) are skipped.
+    for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+        const int cellPos = FOpenCLGroupOfCells_getCellIndex(&cellsOther, outsideInteractions[outInterIdx].outIndex);
+        if(cellPos != -1){
+            FOpenCLAssertLF(outsideInteractions[outInterIdx].outIndex == FOpenCLGroupOfCells_getCellMortonIndex(&cellsOther, cellPos)); // look up by position in block, not by Morton index
+            struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, cellPos);
+            FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
+            struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock);
+            FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
+
+            const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition;
+            M2L( cell , &interCell, &relativeOutPosition,
+                 1, idxLevel, userkernel);
+        }
+    }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Transfer Pass
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each cell of the block, asks GetInteractionNeighbors for candidate interaction cells, keeps the
+/// ones whose Morton index lies in [blockStartIdx, blockEndIdx) and that are found in this same block,
+/// and calls M2L once with those cells (up data) and their relative positions.
+__kernel  void FOpenCL__transferInPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,
+                                              __global unsigned char* currentCellsUpPtr, __global unsigned char* currentCellsDownPtr,
+                                              int idxLevel, __global void* userkernel){
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, currentCellsDownPtr);
+
+    const MortonIndex blockStartIdx = FOpenCLGroupOfCells_getStartingIndex(&currentCells);
+    const MortonIndex blockEndIdx = FOpenCLGroupOfCells_getEndingIndex(&currentCells);
+
+    const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells);
+
+    for(int idxCell = 0 ; idxCell < nbCells ; ++idxCell){
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxCell);
+        FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxCell));
+        MortonIndex interactionsIndexes[189];
+        int interactionsPosition[189];
+        const int3 coord = (getCoordinate(cell));
+        int counter = GetInteractionNeighbors(coord, idxLevel,interactionsIndexes,interactionsPosition);
+
+        struct FWrappeCell interactions[343];
+        FSetToNullptr343(interactions);
+        int counterExistingCell = 0;
+        // Pack the cells that were found at the front; interactionsPosition is compacted in place alongside them.
+        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
+            if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){
+                const int cellPos = FOpenCLGroupOfCells_getCellIndex(&currentCells, interactionsIndexes[idxInter]);
+                if(cellPos != -1){
+                    struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&currentCells, cellPos);
+                    interactions[counterExistingCell] = interCell;
+                    interactionsPosition[counterExistingCell] = interactionsPosition[idxInter];
+                    counterExistingCell += 1;
+                }
+            }
+        }
+        // M2L is called even when counterExistingCell is 0.
+        M2L( cell , interactions, interactionsPosition,
+             counterExistingCell, idxLevel, userkernel);
+    }
+}
+
+/// Runs one M2L call per entry of outsideInteractions between cells of two distinct blocks.
+/// mode == 1 passes the currentCells cell first; any other mode passes the cellsOther cell first with the opposite relative position.
+__kernel void FOpenCL__transferInoutPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize,
+                                                __global unsigned char*  currentCellsUpPtr,
+                                                __global unsigned char* externalCellsPtr, size_t externalCellsSize,
+                                                __global unsigned char* externalCellsDownPtr,
+                                                int idxLevel, int mode, const __global struct OutOfBlockInteraction* outsideInteractions,
+                                                size_t nbOutsideInteractions, __global void* userkernel){
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, currentCellsUpPtr, NULLPTR);
+    struct FOpenCLGroupOfCells cellsOther = BuildFOpenCLGroupOfCells(externalCellsPtr, externalCellsSize, NULLPTR, externalCellsDownPtr);
+    // NOTE(review): the mode == 1 branch reads the up part of cellsOther and the down part of currentCells, both built with NULLPTR above - confirm against the host-side buffers.
+    if(mode == 1){
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            struct FWrappeCell interCell = FOpenCLGroupOfCells_getUpCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock);
+            FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
+            struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock);
+            FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
+
+            const int relativeOutPosition = outsideInteractions[outInterIdx].relativeOutPosition;
+            M2L( cell , &interCell, &relativeOutPosition,
+                 1, idxLevel, userkernel);
+        }
+    }
+    else{
+        for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+            struct FWrappeCell interCell = FOpenCLGroupOfCells_getDownCell(&cellsOther, outsideInteractions[outInterIdx].outsideIdxInBlock);
+            FOpenCLAssertLF(interCell.symb->mortonIndex == outsideInteractions[outInterIdx].outIndex);
+            struct FWrappeCell cell = FOpenCLGroupOfCells_getUpCell(&currentCells, outsideInteractions[outInterIdx].insideIdxInBlock);
+            FOpenCLAssertLF(cell.symb->mortonIndex == outsideInteractions[outInterIdx].insideIndex);
+            // The stored position is relative to the inside cell, so it is mirrored before swapping the roles.
+            const int relativepos = FMGetOppositeInterIndex(outsideInteractions[outInterIdx].relativeOutPosition);
+            M2L( interCell , &cell, &relativepos, 1, idxLevel, userkernel);
+        }
+    }
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Downward Pass
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each parent cell in the Morton range common to both blocks, gathers its child cells (down data)
+/// and calls L2L. The traversal has the same shape as FOpenCL__upwardPassPerform.
+__kernel void FOpenCL__downardPassPerform(__global unsigned char* currentCellsPtr, size_t currentCellsSize, __global unsigned char* currentCellsDownPtr,
+                                          __global unsigned char* childCellsPtr, size_t childCellsSize, __global unsigned char* childCellsDownPtr,
+                                          int idxLevel, __global void* userkernel){
+    struct FOpenCLGroupOfCells currentCells = BuildFOpenCLGroupOfCells(currentCellsPtr, currentCellsSize, NULLPTR, currentCellsDownPtr);
+    const int nbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&currentCells); // not read again in this kernel
+    struct FOpenCLGroupOfCells childCells = BuildFOpenCLGroupOfCells(childCellsPtr, childCellsSize, NULLPTR, childCellsDownPtr);
+    const int childNbCells = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&childCells);
+    // Parent Morton range covered by both blocks; a child's parent index is its Morton index >> 3.
+    const MortonIndex firstParent = FOpenCLMax(FOpenCLGroupOfCells_getStartingIndex(&currentCells), FOpenCLGroupOfCells_getStartingIndex(&childCells)>>3);
+    const MortonIndex lastParent = FOpenCLMin(FOpenCLGroupOfCells_getEndingIndex(&currentCells)-1, (FOpenCLGroupOfCells_getEndingIndex(&childCells)-1)>>3);
+
+    int idxParentCell = FOpenCLGroupOfCells_getCellIndex(&currentCells,firstParent);
+    int idxChildCell = FOpenCLGroupOfCells_getFistChildIdx(&childCells,firstParent);
+    // NOTE(review): neither start index is checked against -1 (the "not found" value tested in other kernels) - confirm the caller guarantees a shared parent.
+    while(true){
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&currentCells, idxParentCell);
+        struct FWrappeCell child[8];
+        for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
+            child[idxChild].symb = NULLPTR;
+        }
+        // Take consecutive child cells while their parent (Morton index >> 3) is this cell; slot = Morton index & 7.
+        do{
+            const int idxChild = ((FOpenCLGroupOfCells_getCellMortonIndex(&childCells,idxChildCell)) & 7);
+            child[idxChild] = FOpenCLGroupOfCells_getDownCell(&childCells, idxChildCell);
+            idxChildCell += 1;
+        }while(idxChildCell != childNbCells && cell.symb->mortonIndex == (FOpenCLGroupOfCells_getCellMortonIndex(&childCells, idxChildCell)>>3));
+
+        L2L(cell, child, idxLevel, userkernel);
+        // Stop once the parent whose Morton index equals lastParent has been processed.
+        if(FOpenCLGroupOfCells_getCellMortonIndex(&currentCells, idxParentCell) == lastParent){
+            break;
+        }
+
+        idxParentCell += 1;
+    }
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Direct Pass MPI
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each listed out-of-block interaction whose neighbour leaf is found in the external block, calls
+/// P2PRemote with the local leaf and that external leaf. outsideInteractionsCl is not read in the body.
+__kernel void FOpenCL__directInoutPassPerformMpi(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,
+                                                 __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* outsideInteractionsCl,
+                                                 const __global struct OutOfBlockInteraction* outsideInteractions,
+                                                 size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){
+    struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr);
+    struct FOpenCLGroupOfParticles containersOther = BuildFOpenCLGroupOfParticles(externalContainersPtr, externalContainersSize, NULLPTR);
+    // Interactions whose outIndex is not found in the external block (getLeafIndex == -1) are skipped.
+    for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+        const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex);
+        if(leafPos != -1){
+            FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containersOther, leafPos) == outsideInteractions[outInterIdx].outIndex);
+            struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, leafPos);
+            struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock);
+            FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex);
+            // The local leaf is passed twice (second and third argument); the leaf position is computed at level treeHeight-1.
+            P2PRemote( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles ,
+                       interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel);
+        }
+    }
+}
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Direct Pass
+/////////////////////////////////////////////////////////////////////////////////////
+/// For each leaf of the block, asks GetNeighborsIndexes for candidate neighbour leaves, keeps the ones
+/// whose Morton index lies in [blockStartIdx, blockEndIdx) and that are found in this same block, and
+/// calls P2P once with the leaf, those neighbours and their relative positions.
+__kernel void FOpenCL__directInPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,
+                                           const int treeHeight, __global void* userkernel){
+    struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr);
+
+    const MortonIndex blockStartIdx = FOpenCLGroupOfParticles_getStartingIndex(&containers);
+    const MortonIndex blockEndIdx = FOpenCLGroupOfParticles_getEndingIndex(&containers);
+
+    const int nbLeaves = FOpenCLGroupOfParticles_getNumberOfLeaves(&containers);
+
+    for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
+        struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf);
+        MortonIndex interactionsIndexes[26];
+        int interactionsPosition[26];
+        const int3 coord = GetPositionFromMorton(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf), treeHeight-1);
+        int counter = GetNeighborsIndexes(coord, treeHeight,interactionsIndexes,interactionsPosition);
+
+        struct FOpenCLGroupAttachedLeaf interactionsObjects[27];
+        int neighPosition[26];
+        int counterExistingCell = 0;
+        // Keep only the neighbours stored in this block; the first counterExistingCell slots are filled.
+        for(int idxInter = 0 ; idxInter < counter ; ++idxInter){
+            if( blockStartIdx <= interactionsIndexes[idxInter] && interactionsIndexes[idxInter] < blockEndIdx ){
+                const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containers, interactionsIndexes[idxInter]);
+                if(leafPos != -1){
+                    FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, leafPos) == interactionsIndexes[idxInter]);
+                    interactionsObjects[counterExistingCell] = FOpenCLGroupOfParticles_getLeaf(&containers, leafPos);
+                    neighPosition[counterExistingCell] = interactionsPosition[idxInter];
+                    counterExistingCell += 1;
+                }
+            }
+        }
+        // P2P is called even when counterExistingCell is 0; the leaf itself is passed twice.
+        P2P( coord, particles, particles , interactionsObjects, neighPosition, counterExistingCell, userkernel);
+    }
+}
+
+/// For each listed interaction between leaves of two distinct blocks, calls P2POuter in both directions:
+/// local leaf against external leaf, then external leaf against local leaf with the opposite neighbour index.
+__kernel void FOpenCL__directInoutPassPerform(__global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,
+                                              __global unsigned char* externalContainersPtr, size_t externalContainersSize, __global unsigned char* externalContainersDownPtr,
+                                              const __global struct OutOfBlockInteraction* outsideInteractions,
+                                              size_t nbOutsideInteractions, const int treeHeight, __global void* userkernel){
+    struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr, containersSize, containersDownPtr);
+    struct FOpenCLGroupOfParticles containersOther = BuildFOpenCLGroupOfParticles(externalContainersPtr, externalContainersSize, externalContainersDownPtr);
+    // leafPos is only an existence test here; the external leaf itself is fetched through outsideIdxInBlock.
+    for(int outInterIdx = 0 ; outInterIdx < nbOutsideInteractions ; ++outInterIdx){
+        const int leafPos = FOpenCLGroupOfParticles_getLeafIndex(&containersOther, outsideInteractions[outInterIdx].outIndex);
+        if(leafPos != -1){
+            struct FOpenCLGroupAttachedLeaf interParticles = FOpenCLGroupOfParticles_getLeaf(&containersOther, outsideInteractions[outInterIdx].outsideIdxInBlock);
+            struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, outsideInteractions[outInterIdx].insideIdxInBlock);
+
+            FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, outsideInteractions[outInterIdx].insideIdxInBlock) == outsideInteractions[outInterIdx].insideIndex);
+            FOpenCLAssertLF(particles.nbParticles);
+            FOpenCLAssertLF(interParticles.nbParticles);
+            // Direction 1: the local leaf is first, with the stored relative position.
+            P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].insideIndex, treeHeight-1), particles, particles ,
+                      interParticles, outsideInteractions[outInterIdx].relativeOutPosition, userkernel );
+            // Direction 2: the external leaf is first, with the mirrored relative position.
+            P2POuter( GetPositionFromMorton(outsideInteractions[outInterIdx].outIndex, treeHeight-1), interParticles, interParticles ,
+                      particles, FMGetOppositeNeighIndex(outsideInteractions[outInterIdx].relativeOutPosition), userkernel);
+        }
+    }
+}
+
+
+
+/////////////////////////////////////////////////////////////////////////////////////
+/// Merge Pass
+/////////////////////////////////////////////////////////////////////////////////////
+/// Walks the leaf cells and the particle leaves of two matching blocks in lockstep (same index in both)
+/// and calls L2P for each pair, using the down data of the cell.
+/// The number of iterations comes from the cell block only.
+__kernel void FOpenCL__mergePassPerform(__global unsigned char* leafCellsPtr, size_t leafCellsSize, __global unsigned char* leafCellsDownPtr,
+                                        __global unsigned char* containersPtr, size_t containersSize, __global unsigned char* containersDownPtr,
+                                        __global void* userkernel){
+    struct FOpenCLGroupOfCells leafCells = BuildFOpenCLGroupOfCells(leafCellsPtr,leafCellsSize, NULLPTR, leafCellsDownPtr);
+    struct FOpenCLGroupOfParticles containers = BuildFOpenCLGroupOfParticles(containersPtr,containersSize, containersDownPtr);
+
+    const int nbLeaves = FOpenCLGroupOfCells_getNumberOfCellsInBlock(&leafCells);
+
+    for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
+        struct FWrappeCell cell = FOpenCLGroupOfCells_getDownCell(&leafCells, idxLeaf);
+        FOpenCLAssertLF(cell.symb->mortonIndex == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf));
+        struct FOpenCLGroupAttachedLeaf particles = FOpenCLGroupOfParticles_getLeaf(&containers, idxLeaf);
+        FOpenCLAssertLF(FOpenCLGroupOfParticles_getLeafMortonIndex(&containers, idxLeaf) == FOpenCLGroupOfCells_getCellMortonIndex(&leafCells, idxLeaf)); // cell and leaf at the same index must share a Morton index
+        L2P(cell, particles, userkernel);
+    }
+}
+
diff --git a/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp b/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..4556fa722c755c46fb7be820ff26badfc85a04c5
--- /dev/null
+++ b/Src/GroupTree/Uniform/FUniformOpenCLCode.hpp
@@ -0,0 +1,67 @@
+#ifndef FUNIFORMOPENCLCODE_HPP
+#define FUNIFORMOPENCLCODE_HPP
+
+
+#include "../../Utils/FGlobal.hpp"
+#include "../StarPUUtils/FStarPUDefaultAlign.hpp"
+#include "../OpenCl/FTextReplacer.hpp"
+
+#include "../../Kernels/Uniform/FUnifCell.hpp"
+
+// Holds the text of FUniformKernel.cl with its placeholder tokens substituted for a given FReal and ORDER.
+template <class FReal, const int ORDER>
+class FUniformOpenCLCode{
+    FTextReplacer kernelfile; // kernel source text; presumably read from the path given in the constructor
+    size_t dim;               // set to 1 in the constructor; its address is returned by getNbGroups/getGroupSize
+
+public:
+    FUniformOpenCLCode() : kernelfile("../Src/GroupTree/Uniform/FUniformKernel.cl"){ // NOTE(review): relative path, presumably resolved against the working directory - confirm
+        if(sizeof(FReal) == sizeof(double)){
+            kernelfile.replaceAll("___FReal___", "double");
+        }
+        else{
+            kernelfile.replaceAll("___FReal___", "float");
+        }
+        FAssertLF((typeid(FSize) == typeid(long long int))); // NOTE(review): typeid needs <typeinfo>, which is not included directly here
+        kernelfile.replaceAll("___FSize___", "long long int");
+        kernelfile.replaceAll("___FParticleValueClass___", "long long");
+        kernelfile.replaceAll("___NbSymbAttributes___", 0);
+        kernelfile.replaceAll("___NbAttributesPerParticle___", 1);
+        const size_t structAlign = FStarPUDefaultAlign::StructAlign;
+        kernelfile.replaceAll("___DefaultStructAlign___", structAlign);
+        kernelfile.replaceAll("___FP2PDefaultAlignement___", FP2PDefaultAlignement);
+        // Kernel-specific tokens: both the pole and the local size use cell.getVectorSize().
+        kernelfile.replaceAll("__ORDER__", ORDER);
+        FUnifCell<FReal, ORDER> cell;
+        kernelfile.replaceAll("__POLE_SIZE__", cell.getVectorSize());
+        kernelfile.replaceAll("__LOCAL_SIZE__", cell.getVectorSize());
+
+        dim = 1;
+    }
+    // Returns kernelfile.getContent(); the device id argument is ignored.
+    const char* getKernelCode(const int /*inDevId*/){
+        return kernelfile.getContent();
+    }
+    // Calls kernelfile.clear().
+    void releaseKernelCode(){
+        kernelfile.clear();
+    }
+    // Always returns 1.
+    unsigned int getNbDims() const {
+        return 1;
+    }
+    // The size-interval argument is ignored.
+    const size_t* getNbGroups(const int /*inSizeInterval*/) const {
+        // Pointer to dim, which the constructor sets to 1
+        return &dim;
+    }
+    // Returns the same pointer as getNbGroups.
+    const size_t* getGroupSize() const {
+        // Pointer to dim, which the constructor sets to 1
+        return &dim;
+    }
+};
+
+
+#endif // FUNIFORMOPENCLCODE_HPP
+
diff --git a/Src/ScalFmmConfig.h.cmake b/Src/ScalFmmConfig.h.cmake
index 9ae8f6fbc3a531ceeb8fd910f5f1c6c51a9455e7..d62c7af25af646e0d202836ec805023327e2ef38 100644
--- a/Src/ScalFmmConfig.h.cmake
+++ b/Src/ScalFmmConfig.h.cmake
@@ -131,6 +131,12 @@ const std::string SCALFMMCompileLibs("@SCALFMM_COMPILE_LIBS@");
 
 #cmakedefine OPENMP_SUPPORT_PRIORITY
 
+///////////////////////////////////////////////////////
+// To use a taskname clause for tasks with KSTAR OMP4
+///////////////////////////////////////////////////////
+
+#cmakedefine OPENMP_SUPPORT_TASK_NAME
+
 ///////////////////////////////////////////////////////
 // To record omp4 task times for statistics
 ///////////////////////////////////////////////////////
diff --git a/Src/Utils/FMpi.hpp b/Src/Utils/FMpi.hpp
index dec2ee94da810f4f3e545ade0cf930ea8fbd7e5f..f0654ea8ea223c919ca278774fecc095be29bf9e 100644
--- a/Src/Utils/FMpi.hpp
+++ b/Src/Utils/FMpi.hpp
@@ -28,6 +28,7 @@
 
 #include "FNoCopyable.hpp"
 #include "FMath.hpp"
+#include "FAssert.hpp"
 
 //Need that for converting datas
 #include "FComplex.hpp"
@@ -250,6 +251,13 @@ public:
         void barrier() const {
             FMpi::Assert(MPI_Barrier(getComm()), __LINE__);
         }
+        /** Non-blocking probe: returns true if a message from any source with any tag is pending on this communicator. */
+        bool hasPendingMessage() const {
+            MPI_Status status;
+            int flag = 0;
+            FMpi::Assert(MPI_Iprobe(MPI_ANY_SOURCE, MPI_ANY_TAG, getComm(), &flag, &status), __LINE__); // check the return code like barrier() does
+            return (flag != 0);
+        }
     };
 
     ////////////////////////////////////////////////////////
@@ -427,6 +435,44 @@ public:
         }
     }
 
+    static const size_t MaxBytesPerDivMess = 20000000; // largest message, in bytes, posted by ISendSplit/IRecvSplit
+    /** Posts non-blocking sends of toSend as raw bytes, in chunks of at most MaxBytesPerDivMess; returns the number of chunks. */
+    template <class ObjectType, class VectorType>
+    static int ISendSplit(const ObjectType toSend[], const size_t nbItems,
+                          const int dest, const int tagBase, const FMpi::FComm& communicator,
+                          VectorType* requestVector){
+        const size_t totalByteToSend  = (nbItems*sizeof(ObjectType));
+        unsigned char*const ptrDataToSend = (unsigned char*)const_cast<ObjectType*>(toSend);
+        for(size_t idxSize = 0 ; idxSize < totalByteToSend ; idxSize += MaxBytesPerDivMess){
+            MPI_Request currentRequest;
+            const size_t nbBytesInMessage = FMath::Min(MaxBytesPerDivMess, totalByteToSend-idxSize);
+            FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); // the count is narrowed to int below
+            FMpi::Assert( MPI_Isend(&ptrDataToSend[idxSize], int(nbBytesInMessage), MPI_BYTE , dest,
+                          tagBase + int(idxSize/MaxBytesPerDivMess), communicator.getComm(), &currentRequest) , __LINE__);
+            // Chunk k uses tag tagBase + k; its request is appended so the caller can wait on it.
+            requestVector->push_back(currentRequest);
+        }
+        return int((totalByteToSend+MaxBytesPerDivMess-1)/MaxBytesPerDivMess); // ceil(total / chunk size); 0 when nothing is sent
+    }
+    /** Posts non-blocking receives into toRecv as raw bytes, in chunks of at most MaxBytesPerDivMess; returns the number of chunks. */
+    template <class ObjectType, class VectorType>
+    static int IRecvSplit(ObjectType toRecv[], const size_t nbItems,
+                          const int source, const int tagBase, const FMpi::FComm& communicator,
+                          VectorType* requestVector){
+        const size_t totalByteToRecv  = (nbItems*sizeof(ObjectType));
+        unsigned char*const ptrDataToRecv = (unsigned char*)(toRecv);
+        for(size_t idxSize = 0 ; idxSize < totalByteToRecv ; idxSize += MaxBytesPerDivMess){
+            MPI_Request currentRequest;
+            const size_t nbBytesInMessage = FMath::Min(MaxBytesPerDivMess, totalByteToRecv-idxSize);
+            FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max()); // the count is narrowed to int below
+            FMpi::Assert( MPI_Irecv(&ptrDataToRecv[idxSize], int(nbBytesInMessage), MPI_BYTE , source,
+                          tagBase + int(idxSize/MaxBytesPerDivMess), communicator.getComm(), &currentRequest) , __LINE__);
+            // Chunk k uses tag tagBase + k, the same tag scheme as ISendSplit; its request is appended for the caller.
+            requestVector->push_back(currentRequest);
+        }
+        return int((totalByteToRecv+MaxBytesPerDivMess-1)/MaxBytesPerDivMess); // ceil(total / chunk size); 0 when nothing is received
+    }
+
 private:
     /// The original communicator
     FComm* communicator;
diff --git a/Src/Utils/FQuickSortMpi.hpp b/Src/Utils/FQuickSortMpi.hpp
index 848cf58f76914aabf30433e709d02ecb282e4ef5..437d2ede4e2f81538ffb9ecdef515f88be01b197 100644
--- a/Src/Utils/FQuickSortMpi.hpp
+++ b/Src/Utils/FQuickSortMpi.hpp
@@ -20,14 +20,16 @@
 #include "FMpi.hpp"
 #include "FLog.hpp"
 #include "FAssert.hpp"
+#include "FEnv.hpp"
 
 #include <memory>
 #include <utility>
 
 template <class SortType, class CompareType, class IndexType = size_t>
 class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
-    /** We are limited by the size of int in MPI coms */
-    static const int FQS_MAX_MPI_BYTES = 2000000000;
+#ifdef SCALFMM_USE_LOG
+    static const bool VerboseLog;
+#endif
 
     // We need a structure see the algorithm detail to know more
     struct Partition{
@@ -96,6 +98,7 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
                 const IndexType nbElementsAlreadyOwned = (inFromRightToLeft ? globalElementBalance[idxProc].lowerPart : globalElementBalance[idxProc].greaterPart);
                 const IndexType averageNbElementForRemainingProc = (totalRemainingElements)/(lastProcToRecv-idxProc);
                 totalRemainingElements -= nbElementsAlreadyOwned;
+                FAssertLF(totalRemainingElements >= 0);
                 if(nbElementsAlreadyOwned < averageNbElementForRemainingProc){
                     nbElementsToRecvPerProc[idxProc - firstProcToRecv] = (averageNbElementForRemainingProc - nbElementsAlreadyOwned);
                     totalRemainingElements -= nbElementsToRecvPerProc[idxProc - firstProcToRecv];
@@ -103,8 +106,9 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
                 else{
                     nbElementsToRecvPerProc[idxProc - firstProcToRecv] = 0;
                 }
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentRank << "] nbElementsToRecvPerProc[" << idxProc << "] = " << nbElementsToRecvPerProc[idxProc - firstProcToRecv] << "\n"; )
+                FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentRank << "] nbElementsToRecvPerProc[" << idxProc << "] = " << nbElementsToRecvPerProc[idxProc - firstProcToRecv] << "\n"; )
             }
+            FAssertLF(totalRemainingElements == 0);
         }
 
         // Store in an array the number of element to send
@@ -113,7 +117,7 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
         for(int idxProc = firstProcToSend; idxProc < lastProcToSend ; ++idxProc){
             const IndexType nbElementsAlreadyOwned = (inFromRightToLeft ? globalElementBalance[idxProc].lowerPart : globalElementBalance[idxProc].greaterPart);
             nbElementsToSendPerProc[idxProc-firstProcToSend] = nbElementsAlreadyOwned;
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentRank << "] nbElementsToSendPerProc[" << idxProc << "] = " << nbElementsToSendPerProc[idxProc-firstProcToSend] << "\n"; )
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentRank << "] nbElementsToSendPerProc[" << idxProc << "] = " << nbElementsToSendPerProc[idxProc-firstProcToSend] << "\n"; )
         }
 
         // Compute all the send recv but keep only the ones related to currentRank
@@ -177,31 +181,23 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
         requests.reserve(whatToRecvFromWho.size());
         for(int idxPack = 0 ; idxPack < int(whatToRecvFromWho.size()) ; ++idxPack){
             const PackData& pack = whatToRecvFromWho[idxPack];
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Recv from " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; );
-//             FAssertLF((pack.toElement - pack.fromElement) * sizeof(SortType) < std::numeric_limits<int>::max());
-//             FMpi::Assert( MPI_Irecv((SortType*)&recvBuffer[pack.fromElement], int((pack.toElement - pack.fromElement) * sizeof(SortType)), MPI_BYTE, pack.idProc,
-//                          FMpi::TagQuickSort, currentComm.getComm(), &requests[idxPack]) , __LINE__);
-            // Work per max size
-            const IndexType nbElementsInPack = (pack.toElement - pack.fromElement);
-            const IndexType totalByteToRecv  = IndexType(nbElementsInPack*sizeof(SortType));
-            unsigned char*const ptrDataToRecv = (unsigned char*)&recvBuffer[pack.fromElement];
-            for(IndexType idxSize = 0 ; idxSize < totalByteToRecv ; idxSize += FQS_MAX_MPI_BYTES){
-                MPI_Request currentRequest;
-                const FSize nbBytesInMessage = int(FMath::Min(IndexType(FQS_MAX_MPI_BYTES), totalByteToRecv-idxSize));
-                FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max());
-                FMpi::Assert( MPI_Irecv(&ptrDataToRecv[idxSize], int(nbBytesInMessage), MPI_BYTE, pack.idProc,
-                              int(FMpi::TagQuickSort + idxSize/FQS_MAX_MPI_BYTES), currentComm.getComm(), &currentRequest) , __LINE__);
-
-                requests.push_back(currentRequest);
-            }
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Recv from " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; );
+            FAssertLF(pack.toElement <= totalToRecv);
+            FMpi::IRecvSplit(&recvBuffer[pack.fromElement],
+                    (pack.toElement - pack.fromElement),
+                    pack.idProc,
+                    FMpi::TagQuickSort,
+                    currentComm,
+                    &requests);
+
         }
         FAssertLF(whatToRecvFromWho.size() <= requests.size());
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << "Wait for " << requests.size() << " request \n" );
-        FLOG( FLog::Controller.flush());
+        FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << "Wait for " << requests.size() << " request \n" );
+        FLOG(if(VerboseLog)  FLog::Controller.flush());
         // Wait to complete
         FMpi::Assert( MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE),  __LINE__ );
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Recv Done \n"; )
-                FLOG( FLog::Controller.flush());
+        FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Recv Done \n"; )
+                FLOG(if(VerboseLog)  FLog::Controller.flush());
         // Copy to ouput variables
         (*inPartRecv) = recvBuffer;
         (*inNbElementsRecv) = totalToRecv;
@@ -220,31 +216,22 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
         requests.reserve(whatToSendToWho.size());
         for(int idxPack = 0 ; idxPack < int(whatToSendToWho.size()) ; ++idxPack){
             const PackData& pack = whatToSendToWho[idxPack];
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Send to " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; );
-//            FAssertLF((pack.toElement - pack.fromElement)* sizeof(SortType) < std::numeric_limits<int>::max());
-//            FMpi::Assert( MPI_Isend(const_cast<SortType*>(&inPartToSend[pack.fromElement]), int((pack.toElement - pack.fromElement) * sizeof(SortType)), MPI_BYTE , pack.idProc,
-//                          FMpi::TagQuickSort, currentComm.getComm(), &requests[idxPack]) , __LINE__);
-            // Work per max size
-            const IndexType nbElementsInPack = (pack.toElement - pack.fromElement);
-            const IndexType totalByteToSend  = IndexType(nbElementsInPack*sizeof(SortType));
-            unsigned char*const ptrDataToSend = (unsigned char*)const_cast<SortType*>(&inPartToSend[pack.fromElement]);
-            for(IndexType idxSize = 0 ; idxSize < totalByteToSend ; idxSize += FQS_MAX_MPI_BYTES){
-                MPI_Request currentRequest;
-                const IndexType nbBytesInMessage = int(FMath::Min(IndexType(FQS_MAX_MPI_BYTES), totalByteToSend-idxSize));
-                FAssertLF(nbBytesInMessage < std::numeric_limits<int>::max());
-                FMpi::Assert( MPI_Isend((SortType*)&ptrDataToSend[idxSize], int(nbBytesInMessage), MPI_BYTE , pack.idProc,
-                              int(FMpi::TagQuickSort + idxSize/FQS_MAX_MPI_BYTES), currentComm.getComm(), &currentRequest) , __LINE__);
-
-                requests.push_back(currentRequest);
-            }
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Send to " << pack.idProc << " from " << pack.fromElement << " to " << pack.toElement << "\n"; );
+
+            FMpi::ISendSplit(&inPartToSend[pack.fromElement],
+                    (pack.toElement - pack.fromElement),
+                    pack.idProc,
+                    FMpi::TagQuickSort,
+                    currentComm,
+                    &requests);
         }
         FAssertLF(whatToSendToWho.size() <= requests.size());
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << "Wait for " << requests.size() << " request \n" );
-        FLOG( FLog::Controller.flush());
+        FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG [" << currentComm.processId() << "] Wait for " << requests.size() << " request \n" );
+        FLOG(if(VerboseLog)  FLog::Controller.flush());
         // Wait to complete
         FMpi::Assert( MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE),  __LINE__ );
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Send Done \n"; )
-                FLOG( FLog::Controller.flush());
+        FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Send Done \n"; )
+                FLOG(if(VerboseLog)  FLog::Controller.flush());
     }
 
     static CompareType SelectPivot(const SortType workingArray[], const IndexType currentSize, const FMpi::FComm& currentComm, bool* shouldStop){
@@ -253,33 +240,38 @@ class FQuickSortMpi : public FQuickSort< SortType, IndexType> {
             NO_VALUES,
             AVERAGE_2
         };
-        // We need to know the max value to ensure that the pivot will be different
-        CompareType maxFoundValue = CompareType(workingArray[0]);
         // Check if all the same
         bool allTheSame = true;
-        for(int idx = 1 ; idx < currentSize && allTheSame; ++idx){
-            if(workingArray[0] != workingArray[idx]){
-                allTheSame = false;
-            }
-            // Keep the max
-            maxFoundValue = FMath::Max(maxFoundValue , CompareType(workingArray[idx]));
-        }
         // Check if empty
         const bool noValues = (currentSize == 0);
         // Get the local pivot if not empty
         CompareType localPivot = CompareType(0);
-        if(!noValues){
-            localPivot = (CompareType(workingArray[currentSize/3])+CompareType(workingArray[(2*currentSize)/3]))/2;
+
+        if(noValues == false){
+            // We need to know the max value to ensure that the pivot will be different
+            CompareType maxFoundValue = CompareType(workingArray[0]);
+            // We need to know the min value to ensure that the pivot will be different
+            CompareType minFoundValue = CompareType(workingArray[0]);
+
+            for(IndexType idx = 1 ; idx < currentSize ; ++idx){ // IndexType, not int: currentSize is an IndexType and may exceed INT_MAX
+                // Keep the max
+                maxFoundValue = FMath::Max(maxFoundValue , CompareType(workingArray[idx]));
+                // Keep the min
+                minFoundValue = FMath::Min(minFoundValue , CompareType(workingArray[idx]));
+            }
+            allTheSame = (maxFoundValue == minFoundValue);
+            // Values equal to the pivot are kept on the left, so pick the midpoint between min and max
+            localPivot = ((maxFoundValue-minFoundValue)/2) + minFoundValue;
             // The pivot must be different (to ensure that the partition will return two parts)
             if( localPivot == maxFoundValue && !allTheSame){
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Pivot " << localPivot << " is equal max and allTheSame equal " << allTheSame << "\n"; )
-                        FLOG( FLog::Controller.flush());
+                FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Pivot " << localPivot << " is equal max and allTheSame equal " << allTheSame << "\n"; )
+                        FLOG(if(VerboseLog)  FLog::Controller.flush());
                 localPivot -= 1;
             }
         }
 
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] localPivot = " << localPivot << "\n" );
-        FLOG( FLog::Controller.flush());
+        FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] localPivot = " << localPivot << "\n" );
+        FLOG(if(VerboseLog)  FLog::Controller.flush());
 
         //const int myRank = currentComm.processId();
         const int nbProcs = currentComm.processCount();
@@ -339,20 +331,20 @@ public:
             bool shouldStop;
             const CompareType globalPivot = SelectPivot(workingArray, currentSize, currentComm, &shouldStop);
             if(shouldStop){
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] shouldStop = " << shouldStop << "\n" );
-                FLOG( FLog::Controller.flush());
+                FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] shouldStop = " << shouldStop << "\n" );
+                FLOG(if(VerboseLog)  FLog::Controller.flush());
                 break;
             }
 
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalPivot = " << globalPivot << "\n" );
-            FLOG( FLog::Controller.flush());
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalPivot = " << globalPivot << "\n" );
+            FLOG(if(VerboseLog)  FLog::Controller.flush());
 
             // Split the array in two parts lower equal to pivot and greater than pivot
             const IndexType nbLowerElements = QsPartition(workingArray, 0, currentSize-1, globalPivot);
             const IndexType nbGreaterElements = currentSize - nbLowerElements;
 
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] After Partition: lower = " << nbLowerElements << " greater = " << nbGreaterElements << "\n"; )
-                    FLOG( FLog::Controller.flush());
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] After Partition: lower = " << nbLowerElements << " greater = " << nbGreaterElements << "\n"; )
+                    FLOG(if(VerboseLog)  FLog::Controller.flush());
 
             const int currentRank = currentComm.processId();
             const int currentNbProcs = currentComm.processCount();
@@ -378,19 +370,19 @@ public:
                 globalNumberOfElementsLower   += globalElementBalance[idxProc].lowerPart;
             }
 
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalNumberOfElementsGreater = " << globalNumberOfElementsGreater << "\n"; )
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalNumberOfElementsLower   = " << globalNumberOfElementsLower << "\n"; )
-                    FLOG( FLog::Controller.flush());
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalNumberOfElementsGreater = " << globalNumberOfElementsGreater << "\n"; )
+                    FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] globalNumberOfElementsLower   = " << globalNumberOfElementsLower << "\n"; )
+                    FLOG(if(VerboseLog)  FLog::Controller.flush());
 
             // The proc rank in the middle from the percentage
             int procInTheMiddle;
             if(globalNumberOfElementsLower == 0)        procInTheMiddle = -1;
             else if(globalNumberOfElementsGreater == 0) procInTheMiddle = currentNbProcs-1;
             else procInTheMiddle = int(FMath::Min(IndexType(currentNbProcs-2), (currentNbProcs*globalNumberOfElementsLower)
-                                              /(globalNumberOfElementsGreater + globalNumberOfElementsLower)));
+                                                  /(globalNumberOfElementsGreater + globalNumberOfElementsLower)));
 
-            FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] procInTheMiddle = " << procInTheMiddle << "\n"; )
-                    FLOG( FLog::Controller.flush());
+            FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] procInTheMiddle = " << procInTheMiddle << "\n"; )
+                    FLOG(if(VerboseLog)  FLog::Controller.flush());
 
             // Send or receive depending on the state
             if(currentRank <= procInTheMiddle){
@@ -411,11 +403,11 @@ public:
                 workingArray = fullLowerPart;
                 currentSize = fullNbLowerElementsRecv;
                 // Reduce working group
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Reduce group to " << 0 << " / " << procInTheMiddle << "\n"; )
-                        FLOG( FLog::Controller.flush());
+                FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Reduce group to " << 0 << " / " << procInTheMiddle << "\n"; )
+                        FLOG(if(VerboseLog)  FLog::Controller.flush());
                 currentComm.groupReduce( 0, procInTheMiddle);
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Done\n" );
-                FLOG( FLog::Controller.flush());
+                FLOG(if(VerboseLog)  FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Done\n" );
+                FLOG(if(VerboseLog)  FLog::Controller.flush());
             }
             else {
                 // I am in the group of the greater elements
@@ -435,16 +427,16 @@ public:
                 workingArray = fullGreaterPart;
                 currentSize = fullNbGreaterElementsRecv;
                 // Reduce working group
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Reduce group to " << procInTheMiddle + 1 << " / " << currentNbProcs - 1 << "\n"; )
-                        FLOG( FLog::Controller.flush());
+                FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Reduce group to " << procInTheMiddle + 1 << " / " << currentNbProcs - 1 << "\n"; )
+                        FLOG( if(VerboseLog) FLog::Controller.flush());
                 currentComm.groupReduce( procInTheMiddle + 1, currentNbProcs - 1);
-                FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Done\n"; )
-                        FLOG( FLog::Controller.flush());
+                FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Done\n"; )
+                        FLOG( if(VerboseLog) FLog::Controller.flush());
             }
         }
 
-        FLOG( FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Sequential sort (currentSize = " << currentSize << ")\n"; )
-                FLOG( FLog::Controller.flush());
+        FLOG( if(VerboseLog) FLog::Controller << "SCALFMM-DEBUG ["  << currentComm.processId() << "] Sequential sort (currentSize = " << currentSize << ")\n"; )
+        FLOG( if(VerboseLog) FLog::Controller.flush());
         // Finish by a local sort
         FQuickSort< SortType, IndexType>::QsOmp(workingArray, currentSize, [](const SortType& v1, const SortType& v2){
             return CompareType(v1) <= CompareType(v2);
@@ -454,4 +446,10 @@ public:
     }
 };
 
+
+#ifdef SCALFMM_USE_LOG
+template <class SortType, class CompareType, class IndexType>
+const bool FQuickSortMpi<SortType, CompareType, IndexType>::VerboseLog = FEnv::GetBool("SCALFMM_DEBUG_LOG", false);
+#endif
+
 #endif // FQUICKSORTMPI_HPP
diff --git a/Tests/CMakeLists.txt b/Tests/CMakeLists.txt
index 0834489118c47241b822ccb1dc84da2a1e1e2f8d..9616ae3108e75d83610f8f39f1c9bb2f2983a3df 100644
--- a/Tests/CMakeLists.txt
+++ b/Tests/CMakeLists.txt
@@ -17,8 +17,8 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Src    
-     ${CMAKE_SOURCE_DIR}/Src   
+     ${SCALFMM_BINARY_DIR}/Src    
+     ${SCALFMM_SOURCE_DIR}/Src   
      ${SCALFMM_INCLUDES}
 )
 
diff --git a/Tests/GroupTree/testBlockedRotationCuda.cpp b/Tests/GroupTree/testBlockedRotationCuda.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..fe4b36165f3c61af9f4cdbe14819c2ce76d4fa62
--- /dev/null
+++ b/Tests/GroupTree/testBlockedRotationCuda.cpp
@@ -0,0 +1,202 @@
+// ==== CMAKE =====
+// @FUSE_BLAS
+// @FUSE_STARPU
+// @FUSE_CUDA
+// ================
+// Keep in private GIT
+
+
+#include "../../Src/Utils/FGlobal.hpp"
+
+#include "../../Src/GroupTree/Core/FGroupTree.hpp"
+
+#include "../../Src/Components/FSimpleLeaf.hpp"
+#include "../../Src/Containers/FVector.hpp"
+
+#include "../../Src/Kernels/P2P/FP2PParticleContainer.hpp"
+
+#include "../../Src/Kernels/Rotation/FRotationKernel.hpp"
+
+#include "../../Src/GroupTree/Rotation/FRotationCellPOD.hpp"
+
+#include "../../Src/Utils/FMath.hpp"
+#include "../../Src/Utils/FMemUtils.hpp"
+#include "../../Src/Utils/FParameters.hpp"
+
+#include "../../Src/Files/FRandomLoader.hpp"
+#include "../../Src/Files/FFmaGenericLoader.hpp"
+
+#include "../../Src/GroupTree/Core/FGroupSeqAlgorithm.hpp"
+#include "../../Src/GroupTree/Core/FGroupTaskAlgorithm.hpp"
+
+#include "../../Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp"
+#include "../../Src/GroupTree/StarPUUtils/FStarPUKernelCapacities.hpp"
+
+#include "../../Src/GroupTree/Core/FP2PGroupParticleContainer.hpp"
+
+#include "../../Src/GroupTree/Cuda/FCudaDeviceWrapper.hpp"
+#include "../../Src/GroupTree/Cuda/FCudaEmptyCellSymb.hpp"
+#include "../../Src/GroupTree/Cuda/FCudaGroupOfParticles.hpp"
+#include "../../Src/GroupTree/Cuda/FCudaGroupOfCells.hpp"
+
+
+#include "../../Src/Utils/FParameterNames.hpp"
+
+#include <memory>
+
+template <class FReal>
+class FCudaP2P;
+
+#define RANDOM_PARTICLES
+
+int main(int argc, char* argv[]){
+    // Runs the rotation kernel on a blocked (group) tree via the StarPU task algorithm (CPU + CUDA wrappers) and optionally validates against a direct computation.
+    const FParameterNames LocalOptionBlocSize { {"-bs"}, "The size of the block of the blocked tree"};
+    const FParameterNames LocalOptionNoValidate { {"-no-validation"}, "To avoid comparing with direct computation"};
+    FHelpDescribeAndExit(argc, argv, "Test the blocked tree with the rotation kernel (StarPU + CUDA) and compare it to the direct computation.",
+                         FParameterDefinitions::OctreeHeight,
+#ifdef RANDOM_PARTICLES
+                         FParameterDefinitions::NbParticles,
+#else
+                         FParameterDefinitions::InputFile,
+#endif
+                         FParameterDefinitions::NbThreads,
+                         LocalOptionBlocSize, LocalOptionNoValidate);
+
+    // Initialize the types
+    typedef double FReal;
+    static const int ORDER = 6;
+
+    typedef FRotationCellPODCore         GroupCellSymbClass;
+    typedef FRotationCellPODPole<FReal,ORDER>  GroupCellUpClass;
+    typedef FRotationCellPODLocal<FReal,ORDER> GroupCellDownClass;
+    typedef FRotationCellPOD<FReal,ORDER>      GroupCellClass;
+
+    typedef FP2PGroupParticleContainer<FReal>          GroupContainerClass;
+    typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupContainerClass, 1, 4, FReal>  GroupOctreeClass;
+
+    typedef FStarPUCudaP2PCapacities<FRotationKernel<FReal,GroupCellClass,GroupContainerClass,ORDER>> GroupKernelClass;
+    typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
+
+    typedef FStarPUCudaWrapper<GroupKernelClass,
+            FCudaEmptyCellSymb, int, int,
+            FCudaGroupOfCells<FCudaEmptyCellSymb, int, int>,
+            FCudaGroupOfParticles<FReal, 1, 4, FReal>, FCudaGroupAttachedLeaf<FReal, 1, 4, FReal>, FCudaP2P<FReal> > GroupCudaWrapper;
+
+    typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass,
+            GroupCpuWrapper, GroupCudaWrapper > GroupAlgorithm;
+
+    // Get params
+    const int NbLevels      = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5);
+    const int groupSize     = FParameters::getValue(argc,argv,LocalOptionBlocSize.options, 250);
+
+    // Load the particles
+#ifdef RANDOM_PARTICLES
+    FRandomLoader<FReal> loader(FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, 2000), 1.0, FPoint<FReal>(0,0,0), 0);
+#else
+    const char* const filename = FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/test20k.fma");
+    FFmaGenericLoader<FReal> loader(filename);
+#endif
+    FAssertLF(loader.isOpen());
+    FTic timer;
+    FP2PParticleContainer<FReal> allParticles;
+    for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+        FPoint<FReal> particlePosition;
+        FReal physicalValue;
+#ifdef RANDOM_PARTICLES
+        physicalValue = 0.10;
+        loader.fillParticle(&particlePosition);
+#else
+        loader.fillParticle(&particlePosition, &physicalValue);
+#endif
+        allParticles.push(particlePosition, physicalValue);
+    }
+    std::cout << "Particles loaded in " << timer.tacAndElapsed() << "s\n";
+
+    // Put the data into the tree
+    timer.tic();
+    GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles);
+    groupedTree.printInfoBlocks();
+    std::cout << "Tree created in " << timer.tacAndElapsed() << "s\n";
+
+    // Run the algorithm
+    GroupKernelClass groupkernel(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox());
+    GroupAlgorithm groupalgo(&groupedTree,&groupkernel);
+
+    timer.tic();
+    groupalgo.execute();
+    std::cout << "Kernel executed in " << timer.tacAndElapsed() << "s\n";
+
+    // Validate the result against a direct particle-to-particle computation (skipped with -no-validation)
+    if(FParameters::existParameter(argc, argv, LocalOptionNoValidate.options) == false){
+        FSize offsetParticles = 0;
+        FReal*const allPhysicalValues = allParticles.getPhysicalValues();
+        FReal*const allPosX = const_cast<FReal*>( allParticles.getPositions()[0]);
+        FReal*const allPosY = const_cast<FReal*>( allParticles.getPositions()[1]);
+        FReal*const allPosZ = const_cast<FReal*>( allParticles.getPositions()[2]);
+        // Copy leaf data back into allParticles in tree order so both passes below share the same indexing
+        groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){
+            const FReal*const physicalValues = leafTarget->getPhysicalValues();
+            const FReal*const posX = leafTarget->getPositions()[0];
+            const FReal*const posY = leafTarget->getPositions()[1];
+            const FReal*const posZ = leafTarget->getPositions()[2];
+            const FSize nbPartsInLeafTarget = leafTarget->getNbParticles();
+
+            for(FSize idxPart = 0 ; idxPart < nbPartsInLeafTarget ; ++idxPart){
+                allPhysicalValues[offsetParticles + idxPart] = physicalValues[idxPart];
+                allPosX[offsetParticles + idxPart] = posX[idxPart];
+                allPosY[offsetParticles + idxPart] = posY[idxPart];
+                allPosZ[offsetParticles + idxPart] = posZ[idxPart];
+            }
+
+            offsetParticles += nbPartsInLeafTarget;
+        });
+
+        FAssertLF(offsetParticles == loader.getNumberOfParticles());
+        // Direct results accumulate into allParticles' own potential/force arrays (assumes they are still zero - TODO confirm)
+        FReal*const allDirectPotentials = allParticles.getPotentials();
+        FReal*const allDirectforcesX = allParticles.getForcesX();
+        FReal*const allDirectforcesY = allParticles.getForcesY();
+        FReal*const allDirectforcesZ = allParticles.getForcesZ();
+
+        for(FSize idxTgt = 0 ; idxTgt < offsetParticles ; ++idxTgt){ // FSize index: offsetParticles is an FSize
+            for(FSize idxMutual = idxTgt + 1 ; idxMutual < offsetParticles ; ++idxMutual){
+                FP2PR::MutualParticles(
+                    allPosX[idxTgt],allPosY[idxTgt],allPosZ[idxTgt], allPhysicalValues[idxTgt],
+                    &allDirectforcesX[idxTgt], &allDirectforcesY[idxTgt], &allDirectforcesZ[idxTgt], &allDirectPotentials[idxTgt],
+                    allPosX[idxMutual],allPosY[idxMutual],allPosZ[idxMutual], allPhysicalValues[idxMutual],
+                    &allDirectforcesX[idxMutual], &allDirectforcesY[idxMutual], &allDirectforcesZ[idxMutual], &allDirectPotentials[idxMutual]
+                );
+            }
+        }
+
+        FMath::FAccurater<FReal> potentialDiff;
+        FMath::FAccurater<FReal> fx, fy, fz;
+        offsetParticles = 0;
+        groupedTree.forEachCellLeaf<FP2PGroupParticleContainer<FReal> >([&](GroupCellClass cellTarget, FP2PGroupParticleContainer<FReal> * leafTarget){
+            const FReal*const potentials = leafTarget->getPotentials();
+            const FReal*const forcesX = leafTarget->getForcesX();
+            const FReal*const forcesY = leafTarget->getForcesY();
+            const FReal*const forcesZ = leafTarget->getForcesZ();
+            const FSize nbPartsInLeafTarget = leafTarget->getNbParticles();
+
+            for(FSize idxTgt = 0 ; idxTgt < nbPartsInLeafTarget ; ++idxTgt){
+                potentialDiff.add(allDirectPotentials[idxTgt + offsetParticles], potentials[idxTgt]);
+                fx.add(allDirectforcesX[idxTgt + offsetParticles], forcesX[idxTgt]);
+                fy.add(allDirectforcesY[idxTgt + offsetParticles], forcesY[idxTgt]);
+                fz.add(allDirectforcesZ[idxTgt + offsetParticles], forcesZ[idxTgt]);
+            }
+
+            offsetParticles += nbPartsInLeafTarget;
+        });
+
+        std::cout << "Error : Potential " << potentialDiff << "\n";
+        std::cout << "Error : fx " << fx << "\n";
+        std::cout << "Error : fy " << fy << "\n";
+        std::cout << "Error : fz " << fz << "\n";
+    }
+
+    return 0;
+}
+
+
diff --git a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
index 07015cfa8232431bddcfe78a771817f07437114e..522418cb77d05623f62cc10ed0d6808a13f863d4 100644
--- a/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
+++ b/Tests/GroupTree/testBlockedWithCudaAlgorithm.cpp
@@ -80,7 +80,7 @@ int main(int argc, char* argv[]){
     typedef FGroupTestParticleContainer<FReal>                                     GroupContainerClass;
     typedef FGroupTree< FReal, GroupCellClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,
             GroupContainerClass, 0, 1, long long int>  GroupOctreeClass;
-    typedef FStarPUAllCpuCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >>  GroupKernelClass;
+    typedef FStarPUAllCudaCapacities<FTestKernels< GroupCellClass, GroupContainerClass >>  GroupKernelClass;
 
     typedef FStarPUCpuWrapper<typename GroupOctreeClass::CellGroupClass, GroupCellClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass> GroupCpuWrapper;
     typedef FStarPUCudaWrapper<GroupKernelClass, GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass,
diff --git a/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp b/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp
index 16b3c4916f117f39c6d54090b37c2304309c0857..264467596eab55dcffcf462ba13eb8c9253af888 100644
--- a/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp
+++ b/Tests/GroupTree/testBlockedWithOpenCLAlgorithm.cpp
@@ -44,13 +44,15 @@
 #include "../../Src/GroupTree/OpenCl/FOpenCLDeviceWrapper.hpp"
 
 int main(int argc, char* argv[]){
-    setenv("STARPU_NCPU","0",1);
-    setenv("STARPU_NOPENCL","1",1);
-    //setenv("STARPU_OPENCL_ONLY_ON_CPUS","1",1);
-    setenv("STARPU_OPENCL_ON_CPUS","1",1);
-
-    setenv("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY","1",1);
-    setenv("STARPU_OPENCL_PIPELINE","0",0); // synchronous task
+    if(getenv("HOSTNAME") && strcmp(getenv("HOSTNAME"),"berenger-HP-ProBook-640-G1") == 0){
+        setenv("STARPU_NCPU","0",1);
+        setenv("STARPU_NOPENCL","1",1);
+        setenv("STARPU_OPENCL_ONLY_ON_CPUS","1",1);
+        setenv("STARPU_OPENCL_ON_CPUS","1",1);
+
+        setenv("STARPU_DISABLE_ASYNCHRONOUS_OPENCL_COPY","1",1);
+        setenv("STARPU_OPENCL_PIPELINE","0",0); // synchronous task
+    }
 
     const FParameterNames LocalOptionBlocSize {
         {"-bs"},
diff --git a/Tests/Kernels/testRotationAlgorithmProc.cpp b/Tests/Kernels/testRotationAlgorithmProc.cpp
index 49d51dc14933092eef79049927cf4b2c1f7ad2d6..22613ef15926552f3c6e37839710749d26f7a147 100644
--- a/Tests/Kernels/testRotationAlgorithmProc.cpp
+++ b/Tests/Kernels/testRotationAlgorithmProc.cpp
@@ -115,8 +115,8 @@ int main(int argc, char* argv[])
                                                                 tree.getBoxWidth(),tree.getHeight(),
                                                                 &finalParticles, &balancer);
     { // -----------------------------------------------------
-        std::cout << "Creating & Inserting " << loader.getNumberOfParticles() << " particles ..." << std::endl;
-        std::cout << "For a total of " << loader.getNumberOfParticles() * app.global().processCount() << " particles ..." << std::endl;
+        std::cout << app.global().processId() << "] Creating & Inserting " << finalParticles.getSize()  << " particles ..." << std::endl;
+        std::cout << app.global().processId() << "] For a total of " << loader.getNumberOfParticles() * app.global().processCount() << " particles ..." << std::endl;
         std::cout << "\tHeight : " << TreeHeight << " \t sub-height : " << SubTreeHeight << std::endl;
         time.tic();
 
@@ -126,8 +126,17 @@ int main(int argc, char* argv[])
         }
 
         time.tac();
-        std::cout << "Done  " << "(@Creating and Inserting Particles = "
+        std::cout << app.global().processId() << "] Done  " << "(@Creating and Inserting Particles = "
                   << time.elapsed() << "s)." << std::endl;
+
+        FSize minPart = std::numeric_limits<FSize>::max();
+        FSize maxPart = std::numeric_limits<FSize>::min();
+        tree.forEachLeaf([&](LeafClass* lf){
+            minPart = FMath::Min(lf->getSrc()->getNbParticles(), minPart);
+            maxPart = FMath::Max(lf->getSrc()->getNbParticles(), maxPart);
+        });
+
+        std::cout << app.global().processId() << "] Min nb part " << minPart << " Max nb part " << maxPart << std::endl;
     } // -----------------------------------------------------
 
     delete[] particles;
@@ -139,13 +148,14 @@ int main(int argc, char* argv[])
         KernelClass kernels(TreeHeight, loader.getBoxWidth(), loader.getCenterOfBox());
         FmmClass algorithm(app.global(),&tree, &kernels);
         time.tac();
-        std::cout << "Done  " << "(@Init = " << time.elapsed() << "s)." << std::endl;
+        std::cout << app.global().processId() << "] Done  " << "(@Init = " << time.elapsed() << "s)." << std::endl;
         time.tic();
         algorithm.execute();
         time.tac();
-        std::cout << "Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
+        std::cout << app.global().processId() << "] Done  " << "(@Algorithm = " << time.elapsed() << "s)." << std::endl;
     } // -----------------------------------------------------
 
+    app.global().barrier();
 
     return 0;
 }
diff --git a/Tests/noDist/testParticlesDistrMpi.cpp b/Tests/noDist/testParticlesDistrMpi.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..41ef6b67bd253e4dc627ccb22473055f601dea9d
--- /dev/null
+++ b/Tests/noDist/testParticlesDistrMpi.cpp
@@ -0,0 +1,152 @@
+// ===================================================================================
+// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Berenger Bramas
+// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
+// This software is a computer program whose purpose is to compute the FMM.
+//
+// This software is governed by the CeCILL-C and LGPL licenses and
+// abiding by the rules of distribution of free software.
+//
+// This program is distributed in the hope that it will be useful,
+// but WITHOUT ANY WARRANTY; without even the implied warranty of
+// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+// GNU General Public and CeCILL-C Licenses for more details.
+// "http://www.cecill.info".
+// "http://www.gnu.org/licenses".
+// ===================================================================================
+
+// ==== CMAKE =====
+// @FUSE_MPI
+// ================
+
+#include <iostream>
+
+#include <cstdio>
+#include <cstdlib>
+
+
+#include "../../Src/Kernels/Rotation/FRotationCell.hpp"
+#include "../../Src/Kernels/Rotation/FRotationKernel.hpp"
+
+#include "../../Src/Components/FSimpleLeaf.hpp"
+#include "../../Src/Kernels/P2P/FP2PParticleContainerIndexed.hpp"
+
+#include "../../Src/Utils/FParameters.hpp"
+#include "../../Src/Utils/FMemUtils.hpp"
+
+#include "../../Src/Containers/FOctree.hpp"
+#include "../../Src/Containers/FVector.hpp"
+
+#include "../../Src/Files/FRandomLoader.hpp"
+#include "../../Src/Files/FMpiTreeBuilder.hpp"
+
+#include "../../Src/Core/FFmmAlgorithm.hpp"
+#include "../../Src/Core/FFmmAlgorithmThread.hpp"
+#include "../../Src/Core/FFmmAlgorithmThreadProc.hpp"
+
+#include "../../Src/BalanceTree/FLeafBalance.hpp"
+
+#include "../../Src/Utils/FParameterNames.hpp"
+
+/**
+ * This program tests the MPI distribution of particles done by FMpiTreeBuilder (no FMM kernel is executed)
+ */
+
+// Create particles, distribute them among the processes and check that none is lost, duplicated or altered
+int main(int argc, char* argv[])
+{
+    FHelpDescribeAndExit(argc, argv,
+                         "Test with MPI the distribution of the particles and check that none is lost, duplicated or altered.",
+                         FParameterDefinitions::NbParticles, FParameterDefinitions::OctreeHeight,
+                         FParameterDefinitions::OctreeSubHeight, FParameterDefinitions::NbThreads);
+
+    typedef double FReal;
+
+    // MPI context: app.global() gives the communicator used by every collective step below
+    FMpi app(argc,argv);
+
+    const FSize nbParticles       = FParameters::getValue(argc,argv, FParameterDefinitions::NbParticles.options, 10000000ULL); // per process
+    const unsigned int TreeHeight    = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5);
+    FTic time;
+
+    std::cout << ">> This executable has to be used to test the MPI particles distribution. \n";
+
+    // Particle tagged with its global index (idxPart) so it can be identified after the distribution
+    struct TestParticle{
+        FSize idxPart;
+        FPoint<FReal> position;
+        FReal physicalValue;
+        const FPoint<FReal>& getPosition(){
+            return position;
+        }
+    };
+
+    // Every process creates the particles of ALL the processes to keep a reference copy for the checks
+    std::cout << "Creating : " << nbParticles << "\n" << std::endl;
+
+    time.tic();
+    const FSize totalNbParticles = nbParticles*app.global().processCount();
+    std::unique_ptr<TestParticle[]> particles(new TestParticle[totalNbParticles]); // released automatically
+    memset(particles.get(), 0, sizeof(TestParticle)*std::size_t(totalNbParticles)); // size_t: no 32-bit truncation
+    for(int idxProc = 0 ; idxProc < app.global().processCount() ; ++idxProc){ // idxProc is also given to the loader (presumably a seed - TODO confirm)
+        FRandomLoader<FReal> loader(nbParticles, 1.0, FPoint<FReal>(0,0,0), idxProc);
+        for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){
+            loader.fillParticle(&particles[idxPart + idxProc*nbParticles].position);
+            particles[idxPart + idxProc*nbParticles].physicalValue = 1.0;
+            particles[idxPart + idxProc*nbParticles].idxPart = idxPart + idxProc*nbParticles;
+        }
+    }
+    // Each process gives only its own slice of the reference array to the distribution
+    FVector<TestParticle> finalParticles;
+    FLeafBalance balancer;
+    FMpiTreeBuilder< FReal,TestParticle >::DistributeArrayToContainer(app.global(),&particles[app.global().processId()*nbParticles],
+                                                                nbParticles,
+                                                                FPoint<FReal>(0,0,0),
+                                                                1.0,TreeHeight,
+                                                                &finalParticles, &balancer);
+
+    app.global().barrier();
+
+    std::cout << "Testing : " << finalParticles.getSize()  << "\n" << std::endl;
+    // Each received particle must be identical to the reference particle that has the same index
+    for(FSize idxRes = 0 ; idxRes < finalParticles.getSize() ; ++idxRes){
+        FAssertLF(0 <= finalParticles[idxRes].idxPart, "idxRes ", idxRes, " finalParticles[idxRes].idxPart ", finalParticles[idxRes].idxPart);
+        FAssertLF(finalParticles[idxRes].idxPart < totalNbParticles, "idxRes ", idxRes, " finalParticles[idxRes].idxPart ", finalParticles[idxRes].idxPart);
+
+        const TestParticle correctPart = particles[finalParticles[idxRes].idxPart];
+        const TestParticle testPart = finalParticles[idxRes];
+
+        FAssertLF(testPart.idxPart == correctPart.idxPart);
+        FAssertLF(testPart.position.getX() == correctPart.position.getX());
+        FAssertLF(testPart.position.getY() == correctPart.position.getY());
+        FAssertLF(testPart.position.getZ() == correctPart.position.getZ());
+        FAssertLF(testPart.physicalValue == correctPart.physicalValue);
+    }
+
+    std::cout << "Done\n" << std::endl;
+
+    app.global().barrier();
+    // Flag the particles held by this process; finding the same index twice locally is an error
+    std::unique_ptr<int[]> particlesExist(new int[totalNbParticles]);
+    memset(particlesExist.get(), 0, sizeof(int)*totalNbParticles);
+
+    for(FSize idxRes = 0 ; idxRes < finalParticles.getSize() ; ++idxRes){
+        FAssertLF(particlesExist[finalParticles[idxRes].idxPart] == 0);
+        particlesExist[finalParticles[idxRes].idxPart] = 1;
+    }
+    // Sum the flags over all the processes: every particle must be counted exactly once
+    std::unique_ptr<int[]> particlesReduced(new int[totalNbParticles]);
+    memset(particlesReduced.get(), 0, sizeof(int)*totalNbParticles);
+    // The element count given to MPI_Allreduce is an int, so the total must fit in an int
+    FAssert(totalNbParticles <= std::numeric_limits<int>::max());
+    FMpi::Assert(MPI_Allreduce(particlesExist.get(), particlesReduced.get(), int(totalNbParticles),
+                            MPI_INT, MPI_SUM,
+                            app.global().getComm()), __LINE__);
+
+    for(FSize idxPart = 0 ; idxPart < totalNbParticles ; ++idxPart){
+        FAssertLF(particlesReduced[idxPart] == 1, idxPart, " " , particlesReduced[idxPart]);
+    }
+
+    return 0;
+}
+
+
diff --git a/UTests/CMakeLists.txt b/UTests/CMakeLists.txt
index 3e3d0fdfaff0efacee1e02a3cf6617c6bfc7d3b6..56f2164f315875134d858d7f51760b21b0175fa3 100644
--- a/UTests/CMakeLists.txt
+++ b/UTests/CMakeLists.txt
@@ -57,8 +57,8 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Src 
-     ${CMAKE_SOURCE_DIR}/Src   
+     ${SCALFMM_BINARY_DIR}/Src 
+     ${SCALFMM_SOURCE_DIR}/Src   
      ${SCALFMM_INCLUDES}
 )
 
diff --git a/Utils/CMakeLists.txt b/Utils/CMakeLists.txt
index 5cf190484b93d0a1564a3241a39c971d04da08c9..681a92634ce90e8d85236aa69ed561ba27dedd12 100644
--- a/Utils/CMakeLists.txt
+++ b/Utils/CMakeLists.txt
@@ -17,8 +17,8 @@ file(
 
 # Adding the project sources dir as an include dir
 INCLUDE_DIRECTORIES(
-     ${CMAKE_BINARY_DIR}/Src 
-     ${CMAKE_SOURCE_DIR}/Src
+     ${SCALFMM_BINARY_DIR}/Src 
+     ${SCALFMM_SOURCE_DIR}/Src
      ${SCALFMM_INCLUDES}
 )