diff --git a/.guix b/.guix
new file mode 120000
index 0000000000000000000000000000000000000000..546e858b945fcbce5abd6d70b1132e17a9dc8e15
--- /dev/null
+++ b/.guix
@@ -0,0 +1 @@
+guix-tools/
\ No newline at end of file
diff --git a/CMakePresets.json b/CMakePresets.json
index 4c642c2dd1ae088a27b652fdf2935861c9dc43c4..f9c010ce1671e583f68e98e12ca34d8ef85b1d74 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -15,7 +15,8 @@
 	    "cacheVariables": {
 		"CMAKE_BUILD_TYPE": "Release",
 		"CMAKE_CXX_FLAGS": "-O3 -march=native",
-		"scalfmm_BUILD_UNITS": true
+		"scalfmm_BUILD_UNITS": true,
+		"scalfmm_USE_MPI": true
 	    }
 	},
 	{
@@ -73,11 +74,11 @@
 	},
 	{
 	    "name": "sequential",
-            "hidden": true,
+	    "hidden": true,
 	    "inherits": "base",
 	    "filter": {
 		"exclude": {
-		"name": "_omp$"
+		    "name": "_omp$|_mpi$"
 		}
 	    }
 	},
@@ -91,6 +92,16 @@
 		}
 	    }
 	},
+	{
+	    "name": "mpi",
+            "hidden": true,
+	    "inherits": "base",
+	    "filter": {
+		"include": {
+		"name": "_mpi$"
+		}
+	    }
+	},
 	{
 	    "name": "test-default",
 	    "inherits": "base",
@@ -132,6 +143,20 @@
 	    "displayName": "Run OpenMP tests (MKL)",
 	    "description": "Run only the OpenMP tests with the MKL",
 	    "configurePreset": "mkl"
+	},
+	{
+	    "name": "test-default-mpi",
+	    "inherits": "mpi",
+	    "displayName": "Run MPI tests (OpenBLAS)",
+	    "description": "Run only the MPI tests with OpenBLAS",
+	    "configurePreset": "default"
+	},
+	{
+	    "name": "test-mkl-mpi",
+	    "inherits": "mpi",
+	    "displayName": "Run MPI tests (MKL)",
+	    "description": "Run only the MPI tests with the MKL",
+	    "configurePreset": "mkl"
 	}
     ]
 }
diff --git a/README.md b/README.md
index 2b6283641677d570fc9f429c5534f1f37b4a4992..df3e5d7ca8ded55aaaa079eb25e2151328bd50a3 100644
--- a/README.md
+++ b/README.md
@@ -110,7 +110,7 @@ and to compile
 ```bash
 cd /path/to/build/
 # Use cmake, with relevant options
-cmake  -DCMAKE_CXX_FLAGS= `-Xclang -fopenmp`  -S ../
+cmake -DCMAKE_CXX_FLAGS="-Xclang -fopenmp" -S ../
 ```
 #### Optimization
 
@@ -275,6 +275,14 @@ guix time-machine -C .guix/scalfmm-channels.scm -- shell -C -m .guix/scalfmm-man
 
 We provide several manifest files: `scalfmm-manifest-openblas.scm`, `scalfmm-manifest-mkl.scm`.
 
+### Build the documentation with guix
+
+TODO: complete the instructions for building the documentation with `guix time-machine` and `guix shell`. The documentation environment itself can already be obtained with:
+
+```bash
+guix time-machine -C .guix/scalfmm-channels.scm -- shell -C -m .guix/scalfmm-manifest-doc.scm -L .guix/
+```
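+
+Once inside this shell, the documentation is typically generated through CMake.
+The sketch below is only indicative: the `scalfmm_BUILD_DOC` option and the
+`doc` target are assumed names (following the `scalfmm_*` option style used in
+`CMakePresets.json`), so check the CMake configuration for the exact ones.
+
+```bash
+# Assumed option/target names (scalfmm_BUILD_DOC, doc) -- adjust to the project's actual ones.
+cmake -B build -S . -Dscalfmm_BUILD_DOC=ON
+cmake --build build --target doc
+```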
+
 ## Contributing and development guidelines
 
 ### Gitlab flow
diff --git a/checks/CMakeLists.txt b/checks/CMakeLists.txt
index 80a382d34a103749319c1225ad46f2c8f1445bf9..85a49ab41189e8bb61db9fdff4c56fa26edc4263 100644
--- a/checks/CMakeLists.txt
+++ b/checks/CMakeLists.txt
@@ -52,10 +52,7 @@ endif()
 if(${CMAKE_PROJECT_NAME}_USE_MPI)
     list(APPEND source_check_files
         test_build_let.cpp
-
-        # check_mpi.cpp
         count_particles_mpi.cpp
-
         # test_compose.cpp
     )
     # message(WARNING "source_check_files ")
diff --git a/checks/check_1d.cpp b/checks/check_1d.cpp
index d4dc4a8c58254c575e52b393a12adece890ec86f..ff1438e0bb9e439f488be7129548d888c27bd0fd 100644
--- a/checks/check_1d.cpp
+++ b/checks/check_1d.cpp
@@ -57,9 +57,9 @@ namespace local_args
 {
     struct matrix_kernel
     {
-        cpp_tools::cl_parser::str_vec flags = {"--kernel", "--k"};
+        cpp_tools::cl_parser::str_vec flags = {"--kernel", "-k"};
         std::string description = "Matrix kernels: \n   0) 1/r, 2) 1/r^2, "
-                                  "2)  shift(ln r)-> grad  3) val_grad( 1/r)";
+                                  "2)  val_grad( 1/r)";
         using type = int;
         type def = 0;
     };
@@ -244,6 +244,9 @@ auto run(const int& tree_height, const int& group_size, const std::size_t order,
     auto total_height = tree_height;
     interpolator_type interpolator(mk_far, order, total_height, box_width);
 
+    // auto memory = interpolator.memory_usage();
+    // std::cout << "memory " << memory << std::endl;
+
     typename FMM_OPERATOR_TYPE::far_field_type far_field(interpolator);
     //
     near_matrix_kernel_type mk_near{};
@@ -325,58 +328,28 @@ auto run_general(const int& tree_height, const int& group_size, const std::size_
         return run<Dimension, value_type, fmm_operators_type>(tree_height, group_size, order, input_file, output_file,
                                                               check, displayCells, displayParticles);
     }
+    else if(kernel == 2)
+    {   // val_grad_one_over_r
+        using far_matrix_kernel_type = scalfmm::matrix_kernels::laplace::val_grad_one_over_r<1>;
+        using near_matrix_kernel_type = scalfmm::matrix_kernels::laplace::val_grad_one_over_r<1>;
+        using near_field_type = scalfmm::operators::near_field_operator<near_matrix_kernel_type>;
+        //
+        using interpolation_type = interpolator_alias<double, Dimension, far_matrix_kernel_type>;
+
+        // using interpolation_type =
+        //   scalfmm::interpolation::uniform_interpolator<value_type, Dimension, far_matrix_kernel_type>;
+        using far_field_type = scalfmm::operators::far_field_operator<interpolation_type, false>;
+
+        using fmm_operators_type = scalfmm::operators::fmm_operators<near_field_type, far_field_type>;
+
+        return run<Dimension, value_type, fmm_operators_type>(tree_height, group_size, order, input_file, output_file,
+                                                              check, displayCells, displayParticles);
+    }
     else
     {
-        return 0;
+        std::cerr << "Unknown kernel value; see the --kernel option for the supported kernels." << std::endl;
+        std::exit(EXIT_FAILURE);
     }
-    // else if (kernel == 2)
-    //     { // shift_ln_r
-    //         using far_matrix_kernel_type =
-    //             scalfmm::matrix_kernels::laplace::ln_2d;
-    //         using near_matrix_kernel_type =
-    //             scalfmm::matrix_kernels::laplace::grad_ln_2d;
-    //         using near_field_type = scalfmm::operators::near_field_operator<
-    //             near_matrix_kernel_type>;
-    //         //
-    //         using interpolation_type =
-    //             scalfmm::interpolation::uniform_interpolator<
-    //                 value_type, Dimension, far_matrix_kernel_type>;
-    //         using far_field_type =
-    //             scalfmm::operators::far_field_operator<interpolation_type,
-    //                                                    true>;
-
-    //         using fmm_operators_type =
-    //             scalfmm::operators::fmm_operators<near_field_type,
-    //                                               far_field_type>;
-
-    //         return run<Dimension, value_type, fmm_operators_type>(
-    //             tree_height, group_size, order, input_file, output_file, check,
-    //             displayCells, displayParticles);
-    //     }
-    // else if (kernel == 3)
-    //     { // val_grad_one_over_r
-    //         using far_matrix_kernel_type =
-    //             scalfmm::matrix_kernels::laplace::val_grad_one_over_r<2>;
-    //         using near_matrix_kernel_type =
-    //             scalfmm::matrix_kernels::laplace::val_grad_one_over_r<2>;
-    //         using near_field_type = scalfmm::operators::near_field_operator<
-    //             near_matrix_kernel_type>;
-    //         //
-    //         using interpolation_type =
-    //             scalfmm::interpolation::uniform_interpolator<
-    //                 value_type, Dimension, far_matrix_kernel_type>;
-    //         using far_field_type =
-    //             scalfmm::operators::far_field_operator<interpolation_type,
-    //                                                    false>;
-
-    //         using fmm_operators_type =
-    //             scalfmm::operators::fmm_operators<near_field_type,
-    //                                               far_field_type>;
-
-    //         return run<Dimension, value_type, fmm_operators_type>(
-    //             tree_height, group_size, order, input_file, output_file, check,
-    //             displayCells, displayParticles);
-    return 0;
 }
 
 //   scalfmm::matrix_kernels::laplace::one_over_r;
@@ -426,24 +399,4 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     auto ret = run_general<dimension, value_type>(tree_height, group_size, order, kernel, input_file, output_file,
                                                   check, displayCells, displayParticles);
     return ret;
-}
-
-/*
-faux 3d
-
-0 p_tree [0.254212, 0.574017, 0] p_ref [0.254212, 0.574017, 0]
-(8108.38  18.5173) 0 p_tree [0.926114, 0.470606, 0] p_ref [0.926114, 0.470606,
-0](6937.27  7.06436 ) 0 p_tree [0.725386, 0.777877, 0] p_ref [0.725386,
-0.777877,0]   (4583.97  15.7301 ) 0 p_tree [0.411987, 0.622132, 0] p_ref
-[0.411987,0.622132, 0]   (9935.72  16.7617 )
-
-2d
-
-0 p_tree [0.307883, 0.668131] p_ref [0.307883, 0.668131]   (5414.29  13.8412 )
-0 p_tree [0.173692, 0.734691] p_ref [0.173692, 0.734691]   (4656.34  20.3212 )
-0 p_tree [0.254212, 0.574017] p_ref [0.254212, 0.574017]   (8108.38  18.5173 )
-0 p_tree [0.926114, 0.470606] p_ref [0.926114, 0.470606]   (6937.27  7.06436 )
-0 p_tree [0.725386, 0.777877] p_ref [0.725386, 0.777877]   (4583.97  15.7301 )
-0 p_tree [0.411987, 0.622132] p_ref [0.411987, 0.622132]   (9935.72  16.7617 )
-
-*/
+}
\ No newline at end of file
diff --git a/checks/check_2d.cpp b/checks/check_2d.cpp
index 667fc174491f8a35f91dcfa5dbe46dcb50f9cce4..7ca992d3f2d476eb2ccb00b824f4c5ce1cd5e2ac 100644
--- a/checks/check_2d.cpp
+++ b/checks/check_2d.cpp
@@ -251,13 +251,17 @@ auto run(const int& tree_height, const int& group_size, const std::size_t order,
 
     typename FMM_OPERATOR_TYPE::far_field_type far_field(interpolator);
     //
+    auto memory = interpolator.memory_usage();
+    std::cout << "memory " << memory << std::endl;
+    //
     near_matrix_kernel_type mk_near{};
-    const bool mutual_near = true;
+    const bool mutual_near = false;
 
     typename FMM_OPERATOR_TYPE::near_field_type near_field(mk_near, mutual_near);
     //
+
     std::cout << cpp_tools::colors::blue << "Fmm with kernels: " << std::endl
-              << "       near " << mk_near.name() << std::endl
+              << "       near " << mk_near.name() << " mutual " << std::boolalpha << near_field.mutual() << std::endl
               << "       far  " << mk_far.name() << std::endl
               << cpp_tools::colors::reset;
 
@@ -267,8 +271,11 @@ auto run(const int& tree_height, const int& group_size, const std::size_t order,
     int const& separation_criterion = fmm_operator.near_field().separation_criterion();
     bool const& mutual = fmm_operator.near_field().mutual();
     scalfmm::list::sequential::build_interaction_lists(tree, tree, separation_criterion, mutual);
+    // scalfmm::io::trace(std::cout, tree, 4);
 
     auto operator_to_proceed = scalfmm::algorithms::all;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
+    // auto operator_to_proceed = scalfmm::algorithms::nearfield;
     // auto operator_to_proceed = scalfmm::algorithms::p2m | scalfmm::algorithms::m2m | scalfmm::algorithms::m2l;
 
     std::cout << cpp_tools::colors::blue << "operator_to_proceed: ";
@@ -301,22 +308,22 @@ auto run(const int& tree_height, const int& group_size, const std::size_t order,
     }
     ////////////////////////////////////////////////////////////////////////////
     std::cout << "Save Tree\n";
-    std::string outName("saveTree.bin");
-    std::string header("Uniform FFT ");
+    std::string outName("tree_check2d_h" + std::to_string(tree_height) + ".bin");
+    std::string header("chebyshev - Low-rank ");
     scalfmm::tools::io::save(outName, tree, header);
-    //
-    std::cout << "Read Tree\n";
-    value_type eps{1.e-8};
-    auto tree1 = scalfmm::tools::io::read<group_tree_type>(outName);
-
-    if(scalfmm::utils::compare_two_trees(tree, tree1, eps, 3))
-    {
-        std::cout << "Same trees !\n";
-    }
-    else
-    {
-        std::cout << "Trees are different!\n";
-    }
+    // //
+    // std::cout << "Read Tree\n";
+    // value_type eps{1.e-8};
+    // auto tree1 = scalfmm::tools::io::read<group_tree_type>(outName);
+
+    // if(scalfmm::utils::compare_two_trees(tree, tree1, eps, 3))
+    // {
+    //     std::cout << "Same trees !\n";
+    // }
+    // else
+    // {
+    //     std::cout << "Trees are different!\n";
+    // }
     return 1;
 }
 
@@ -330,7 +337,7 @@ auto run_general(const int& tree_height, const int& group_size, const std::size_
     // using far_matrix_kernel_type =
     // scalfmm::matrix_kernels::others::one_over_r2; using
     // near_matrix_kernel_type = scalfmm::matrix_kernels::others::one_over_r2;
-    //  using options = scalfmm::options::uniform_<scalfmm::options::fft_>;
+    // using options = scalfmm::options::uniform_<scalfmm::options::fft_>;
     using options = scalfmm::options::chebyshev_<scalfmm::options::low_rank_>;
     // using options = scalfmm::options::chebyshev_<scalfmm::options::dense_>;
     if(kernel == 0)
diff --git a/checks/count_particles_gen.hpp b/checks/count_particles_gen.hpp
index 82be3ea72fb6d11858970f6d3ed7b3a6d2998a09..4abf785a57c2583bc224d3624fc886d128997d4d 100644
--- a/checks/count_particles_gen.hpp
+++ b/checks/count_particles_gen.hpp
@@ -218,18 +218,20 @@ auto run(const int& tree_height, const int& group_size, Array const& pbc, const
     // // fmm_operator_type fmm_operator{};
     using fmm_operator_type = scalfmm::operators::fmm_operators<count_kernels::particles::count_near_field,
                                                                 count_kernels::particles::count_far_field<Dimension>>;
-    bool mutual = false;
+    bool mutual = true;
 
     count_kernels::particles::count_near_field nf(mutual);
     count_kernels::particles::count_far_field<Dimension> ff{};
     fmm_operator_type fmm_operator(nf, ff);
-    //    auto operator_to_proceed = scalfmm::algorithms::all;
-    auto operator_to_proceed = scalfmm::algorithms::farfield;
+    auto operator_to_proceed = scalfmm::algorithms::all;
+    //  auto operator_to_proceed = scalfmm::algorithms::farfield;
     auto separation_criterion = fmm_operator.near_field().separation_criterion();
     scalfmm::list::sequential::build_interaction_lists(tree, tree, separation_criterion, mutual);
 
     // scalfmm::io::trace(std::cout, tree, 4);
-
+    std::cout << cpp_tools::colors::blue << "operator_to_proceed: ";
+    scalfmm::algorithms::print(operator_to_proceed);
+    std::cout << cpp_tools::colors::reset << std::endl;
 #ifdef COUNT_USE_OPENMP
     scalfmm::algorithms::fmm[scalfmm::options::_s(scalfmm::options::omp)](tree, fmm_operator, operator_to_proceed);
 #else
@@ -273,9 +275,9 @@ auto run(const int& tree_height, const int& group_size, Array const& pbc, const
         std::cout << "wrong number of particles - nb particles (min) " << nb_particles_min << "  (max) "
                   << nb_particles_max << " (expected) " << number_of_particles << std::endl;
 
-#ifdef SCALFMM_COUNT_KERNEL_SAVE_TREE
+#ifndef SCALFMM_COUNT_KERNEL_SAVE_TREE
         std::cout << "Save the Tree \n";
-        std::string outName("saveTreeSeq.bin");
+        std::string outName("tree_count_gen.bin");
         std::string header("count kernel seq ");
         scalfmm::tools::io::save(outName, tree, header);
 #endif
diff --git a/checks/count_particles_mpi.cpp b/checks/count_particles_mpi.cpp
index 0c719bd717fe562deefd55a690ea66918313e49f..4e3edfe4d0321981b71dc575de5935dc1d77953f 100644
--- a/checks/count_particles_mpi.cpp
+++ b/checks/count_particles_mpi.cpp
@@ -7,6 +7,7 @@
 #include "scalfmm/operators/count_kernel/count_kernel.hpp"
 //
 #include "scalfmm/interpolation/grid_storage.hpp"
+#include "scalfmm/meta/const_functions.hpp"
 #include "scalfmm/meta/utils.hpp"
 #include "scalfmm/tools/fma_dist_loader.hpp"
 #include "scalfmm/tools/tree_io.hpp"
@@ -54,15 +55,11 @@
 /// \endcode
 ///
 /// Examples
-///  * we count the number of particles from the input file
-/// \code
-/// count_particles_{omp,seq} --input-file ../data/unitCube_100_PF.fma  --tree-height  3 -gs 2 --dimension 3
-///  \endcode
 ///
 ///  * Here we generate one particle per leaf located at the center,
 ///      the number of particles = number of leaf = std::pow(2, dimension*(tree_height - 1))
 /// \code
-/// count_particles_{omp,seq} --tree-height  3 -gs 2 --dimension 2
+/// count_particles_mpi --tree-height  3 -gs 2 --dimension 2 --dist-part-leaf
 ///  \endcode
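+///
+///  * typical MPI launch (the executable path below is indicative and depends on the build setup):
+/// \code
+/// mpirun -np 2 ./checks/Release/count_particles_mpi --tree-height 3 -gs 2 --dimension 2 --dist-part-leaf
+/// \endcode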
 
 /**
@@ -145,7 +142,7 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_he
          const bool interaction, bool use_leaf_distribution, bool use_particle_distribution) -> int
 {
     static constexpr std::size_t number_of_physical_values = 1;
-    static constexpr std::size_t dimpow2 = pow(2, Dimension);
+    static constexpr std::size_t dimpow2{scalfmm::meta::pow(2, Dimension)};
     const auto runtime_order = 1;
 
     int level_shared{2};
@@ -278,13 +275,15 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_he
     // separation criteria used to construct M2L | P2P ghosts
     int separation = 1;
     // Construct the LET
+    std::cout << "\n build let \n" << std::flush;
+
     auto letTree = scalfmm::tree::let::buildLetTree<group_tree_type>(
       para, number_of_particles, particles_set, box, leaf_level, level_shared, group_size, group_size, runtime_order,
       separation, use_leaf_distribution, use_particle_distribution);
 
     //    if(para.io_master())
     {
-        std::cout << cpp_tools::colors::blue << "Print tree distribution\n";
+        std::cout << cpp_tools::colors::blue << "Print tree distribution\n" << std::flush;
         letTree.print_distrib(std::cout);
         std::cout << "\n trace  2\n" << std::flush;
 
@@ -308,22 +307,28 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_he
     count_kernels::particles::count_near_field nf(mutual);
     count_kernels::particles::count_far_field<Dimension> ff{};
     fmm_operator_type fmm_operator(nf, ff);
-    std::cout << cpp_tools::colors::red << "build_interaction_lists \n" << cpp_tools::colors::reset << std::flush;
+    std::cout << cpp_tools::colors::blue << "Fmm with kernels: " << nf.matrix_kernel().name() << " mutual "
+              << std::boolalpha << mutual << cpp_tools::colors::reset << std::endl;
 
     scalfmm::list::sequential::build_interaction_lists(letTree, letTree, separation_criterion, mutual);
-    std::cout << cpp_tools::colors::red << "trace \n" << cpp_tools::colors::reset << std::flush;
-    // if(para.io_master())
-    {
-        std::cout << cpp_tools::colors::red << "trace  4\n" << cpp_tools::colors::reset << std::flush;
-
-        scalfmm::io::trace(std::cout, letTree, 4);
-    }
-
-    //      auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
-    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::nearfield;
+    // std::cout << cpp_tools::colors::red << "trace \n" << cpp_tools::colors::reset << std::flush;
+    // // if(para.io_master())
+    //  {
+    //      std::cout << cpp_tools::colors::red << "trace  4\n" << cpp_tools::colors::reset << std::flush;
+
+    //      scalfmm::io::trace(std::cout, letTree, 4);
+    //      std::cout << "\n &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&& \n" << std::flush;
+
+    //      std::cout << std::flush;
+    //  }
+    //| scalfmm::algorithms::operators_to_proceed::m2l
+    //  auto operator_to_proceed =
+    //    scalfmm::algorithms::operators_to_proceed::p2p | scalfmm::algorithms::operators_to_proceed::p2m |
+    //    scalfmm::algorithms::operators_to_proceed::m2m | scalfmm::algorithms::operators_to_proceed::m2l |
+    //    scalfmm::algorithms::operators_to_proceed::l2l | scalfmm::algorithms::operators_to_proceed::l2p;
+    //  auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
+    //  auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::nearfield;
     auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::all;
-    //	 auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2l);
-    //	 auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2m  | scalfmm::algorithms::operators_to_proceed::m2l)  ;
 
     scalfmm::algorithms::mpi::proc_task(letTree, fmm_operator, operator_to_proceed);
     //
@@ -356,23 +361,52 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_he
           }
       });
     std::cout << cpp_tools::colors::reset << '\n';
-    if(right_number_of_particles)
-    {
-        std::cout << "Found the right number of particles - nb particles " << number_of_particles << std::endl;
-    }
-    else
+
+    // if(!right_number_of_particles)
+    // {
+    //     std::cout << "wrong number of particles - nb particles (min) " << nb_particles_min << "  (max) "
+    //               << nb_particles_max << " (expected) " << number_of_particles << std::endl;
+
+    //     //  if(para.io_master())
+    //     //      std::cout << "Save Tree in parallel\n";
+    //     //  // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
+    //     //  std::string outName("tree_count_mpi.bin");
+    //     //  std::string header("CHEBYSHEV LOW RANK ");
+    //     //  scalfmm::tools::io::save(para, outName, letTree, header);
+    // }
+    // else
+    // {
+    //     std::cout << "Found the right number of particles - nb particles " << number_of_particles << std::endl;
+    // }
+
+    // for(int level = letTree.height() - 1; level >= 2; --level)
+    // {
+    //     std::cout << "\n  --  level " << level << "   --  " << std::endl;
+    //     scalfmm::component::for_each_mine_leaf(letTree.begin_mine_cells(level), letTree.end_mine_cells(level),
+    //                                            [](auto const& cell)
+    //                                            {
+    //                                                std::cout << "cell index " << cell.index() << "  multipoles "
+    //                                                          << cell.multipoles().at(0) << "  locals "
+    //                                                          << cell.locals().at(0) << std::endl;
+    //                                            });
+    // }
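+    // Combine each process' local check with a logical AND; rank 0 reports the result.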
+    int res{int(right_number_of_particles)};
+
+    para.get_communicator().reduce(&res, 1, MPI_INT, MPI_LAND, 0);
+
+    if(para.io_master())
     {
-        std::cout << "wrong number of particles - nb particles (min) " << nb_particles_min << "  (max) "
-                  << nb_particles_max << " (expected) " << number_of_particles << std::endl;
-
-        if(para.io_master())
-            std::cout << "Save Tree in parallel\n";
-        // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
-        std::string outName("saveTreeLet.bin");
-        std::string header("CHEBYSHEV LOW RANK ");
-        scalfmm::tools::io::save(para, outName, letTree, header);
+        if(bool(res))
+        {
+            std::cout << cpp_tools::colors::blue << "Right number of particles  " << std::boolalpha << bool(res)
+                      << cpp_tools::colors::reset << std::endl;
+        }
+        else
+        {
+            std::cout << cpp_tools::colors::red << "Wrong number of particles  " << std::boolalpha << bool(res)
+                      << cpp_tools::colors::reset << std::endl;
+        }
     }
-
     return 0;
 }
 
@@ -409,19 +443,21 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
         use_particle_distribution = false;
     }
 
+    int dimension = parser.get<local_args::dimension>();
+    //  OpenMP
+    const std::size_t nb_threads{parser.get<args::thread_count>()};
+    omp_set_dynamic(0);
+    omp_set_num_threads(nb_threads);
     if(para.io_master())
     {
-        std::cout << cpp_tools::colors::blue << "<params> Tree height : " << tree_height << cpp_tools::colors::reset
+        std::cout << cpp_tools::colors::blue << "<params> Tree height: " << tree_height << cpp_tools::colors::reset
                   << '\n';
 
-        std::cout << cpp_tools::colors::blue << "<params> Group Size : " << group_size << cpp_tools::colors::reset
+        std::cout << cpp_tools::colors::blue << "<params> Group Size:  " << group_size << cpp_tools::colors::reset
+                  << '\n';
+        std::cout << cpp_tools::colors::blue << "<params> Threads num: " << nb_threads << cpp_tools::colors::reset
                   << '\n';
     }
-    int dimension = parser.get<local_args::dimension>();
-    //  OpenMP
-    const std::size_t nb_threads{parser.get<args::thread_count>()};
-    omp_set_dynamic(0);
-    omp_set_num_threads(nb_threads);
     //
     const bool readFile(parser.exists<local_args::read_file>());
     std::string input_file;
@@ -459,20 +495,19 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
                       << '\n';
         }
     }
-    /*
+    //
     if(dimension == 1)
     {
         run<1>(para, tree_height, group_size, pbc, nb_level_above_root, readFile, input_file, interaction,
                use_leaf_distribution, use_particle_distribution);
     }
     else if(dimension == 2)
-    */
     {
         run<2>(para, tree_height, group_size, pbc, nb_level_above_root, readFile, input_file, interaction,
                use_leaf_distribution, use_particle_distribution);
     }
-    /*
-      else if(dimension == 3)
+
+    else if(dimension == 3)
     {
         run<3>(para, tree_height, group_size, pbc, nb_level_above_root, readFile, input_file, interaction,
                use_leaf_distribution, use_particle_distribution);
@@ -482,9 +517,9 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
         run<4>(para, tree_height, group_size, pbc, nb_level_above_root, readFile, input_file, interaction,
                use_leaf_distribution, use_particle_distribution);
     }
-    */
+
     //
     std::cout << std::flush;
     para.get_communicator().barrier();
     para.end();
-}
+}
\ No newline at end of file
diff --git a/checks/test_build_let.cpp b/checks/test_build_let.cpp
index 6d05c7e9da9671546864d3078fc5c1fcf6e95e55..5a4e68e62a730915c68b7dd876a9b653e30e7a6d 100644
--- a/checks/test_build_let.cpp
+++ b/checks/test_build_let.cpp
@@ -57,18 +57,18 @@ namespace local_args
         cpp_tools::cl_parser::str_vec flags = {"--dist-part"};
         std::string description = "Use the particle distribution to distribute the tree";
     };
-    struct PartLeafDistrib
-    {
-        /// Unused type, mandatory per interface specification
-        using type = bool;
-        /// The parameter is a flag, it doesn't expect a following value
-        enum
-        {
-            flagged
-        };
-        cpp_tools::cl_parser::str_vec flags = {"--dist-part-leaf"};
-        std::string description = "Use two distribution one for the particle and one for the tree";
-    };
+    // struct PartLeafDistrib
+    // {
+    //     /// Unused type, mandatory per interface specification
+    //     using type = bool;
+    //     /// The parameter is a flag, it doesn't expect a following value
+    //     enum
+    //     {
+    //         flagged
+    //     };
+    //     cpp_tools::cl_parser::str_vec flags = {"--dist-part-leaf"};
+    //     std::string description = "Use two distributions: one for the particles and one for the tree";
+    // };
 }   // namespace local_args
 template<int dimension>
 auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string& input_file,
@@ -172,13 +172,32 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
     // bool const mutual = false;                      // fmm_operator.near_field().mutual();
     // scalfmm::list::sequential::build_interaction_lists(letGroupTree, letGroupTree, separation_criterion, mutual);
 
-    // scalfmm::utils::trace(std::cout, letGroupTree, 1);
+    scalfmm::io::trace(std::cout, letGroupTree, 2);
 
-    ///
-    ///////////////////////////////////////////////////////////////////////////////////////////////////////
-    ///
-    ///////////////////////////////////////////////////////////////////////////////////////////////////////
-    ///   Save the data
+///
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+///
+///////////////////////////////////////////////////////////////////////////////////////////////////////
+///   Save the data
+#ifdef SCALFMM_DEBUG_MPI
+    {
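+        // Debug dump: write each rank's local tree and its particles to per-rank files.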
+        const int rank = para.get_process_id();
+        std::string outName1("tree_rank_" + std::to_string(rank) + ".bin");
+        std::string header1("LOCAL TREE ");
+        scalfmm::tools::io::save(outName1, letGroupTree, header1);
+        const int nbDataPerRecord = scalfmm::container::particle_traits<particle_type>::number_of_elements;
+        const int inputs_size = scalfmm::container::particle_traits<particle_type>::inputs_size;
+        const bool verbose_write = false;   // True only for the master
+        std::string outName2("particles_rank_" + std::to_string(rank) + ".fma");
+
+        scalfmm::io::FFmaGenericWriter<value_type> writer_seq(outName2, verbose_write);
+        // Get the number of particles
+        auto number_of_particles = letGroupTree.number_particles();
+        std::clog << "rank[" + std::to_string(rank) + "] number_of_particles " << number_of_particles << std::endl;
+        ///
+        writer_seq.writeDataFromTree(letGroupTree, number_of_particles);
+    }
+#endif
     // const int nbDataPerRecord = scalfmm::container::particle_traits<particle_type>::number_of_elements;
     // const int inputs_size = scalfmm::container::particle_traits<particle_type>::inputs_size;
 
@@ -215,7 +234,7 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     auto parser = cpp_tools::cl_parser::make_parser(
       cpp_tools::cl_parser::help{}, args::input_file(), args::output_file(), args::tree_height{}, args::order{},
       args::thread_count{}, args::block_size{}, args::Dimension{}, local_args::PartDistrib{},
-      local_args::PartLeafDistrib{}, local_args::LevelShared{});
+      /*local_args::PartLeafDistrib{},*/ local_args::LevelShared{});
     parser.parse(argc, argv);
     // Getting command line parameters
     const int tree_height{parser.get<args::tree_height>()};
@@ -229,11 +248,11 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
 
     bool use_particle_distribution{parser.exists<local_args::PartDistrib>()};
     bool use_leaf_distribution{!use_particle_distribution};
-    if(parser.exists<local_args::PartLeafDistrib>())
-    {
-        use_leaf_distribution = true;
-        use_particle_distribution = true;
-    }
+    // if(parser.exists<local_args::PartLeafDistrib>())
+    // {
+    //     use_leaf_distribution = true;
+    //     use_particle_distribution = true;
+    // }
 
     if(para.io_master())
     {
diff --git a/checks/test_build_tree.cpp b/checks/test_build_tree.cpp
index cf97d76a903548b0ba3fc7285ba11995ecc351c8..29cf5218d7afbb452dbf42df5a47fefb75f4832d 100644
--- a/checks/test_build_tree.cpp
+++ b/checks/test_build_tree.cpp
@@ -1,4 +1,6 @@
-#include <array>
+// @FUSE_OMP
+
+#include <array>
 #include <chrono>
 #include <thread>
 
@@ -6,12 +8,12 @@
 #include "scalfmm/interpolation/interpolation.hpp"
 #include "scalfmm/lists/sequential.hpp"
 #include "scalfmm/matrix_kernels/laplace.hpp"
-#include "scalfmm/tools/fma_dist_loader.hpp"
+// #include "scalfmm/tools/fma_dist_loader.hpp"
 #include "scalfmm/tools/fma_loader.hpp"
 #include "scalfmm/tools/tree_io.hpp"
 #include "scalfmm/tree/box.hpp"
 #include "scalfmm/tree/cell.hpp"
-// #include "scalfmm/tree/group_let.hpp"
+//
 #include "scalfmm/tree/group_tree_view.hpp"
 #include "scalfmm/tree/io.hpp"
 #include "scalfmm/tree/leaf_view.hpp"
@@ -20,7 +22,7 @@
 
 #include <cpp_tools/cl_parser/tcli.hpp>
 #include <cpp_tools/colors/colorized.hpp>
-#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+// #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 ///
 /// \brief main
 /// \param argv
@@ -55,9 +57,8 @@ namespace local_args
 
 }   // namespace local_args
 template<int dimension>
-auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string& input_file,
-         const std::string& output_file, const int tree_height, const int& part_group_size, const int& leaf_group_size,
-         const int order) -> int
+auto run(const std::string& input_file, const std::string& output_file, const int tree_height,
+         const int& part_group_size, const int& leaf_group_size, const int order) -> int
 {
     constexpr int nb_inputs_near = 1;
     constexpr int nb_outputs_near = 1;
@@ -87,8 +88,8 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
     ///
     ///   1) read constants of the problem in file;
     ///   2) each processor read N/P particles
-
-    scalfmm::io::DistFmaGenericLoader<value_type, dimension> loader(input_file, para, para.io_master());
+    bool verbose = true;
+    scalfmm::io::FFmaGenericLoader<value_type, dimension> loader(input_file, verbose);
     //
     const int local_number_of_particles = loader.getMyNumberOfParticles();
     value_type width = loader.getBoxWidth();
@@ -194,10 +195,10 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
 
 auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
 {
-    cpp_tools::parallel_manager::parallel_manager para;
-    para.init();
-    std::cout << std::boolalpha << "para.io_master() " << para.io_master() << " get_process_id() "
-              << para.get_process_id() << std::endl;
+    // cpp_tools::parallel_manager::parallel_manager para;
+    // para.init();
+    // std::cout << std::boolalpha << "para.io_master() " << para.io_master() << " get_process_id() "
+    //           << para.get_process_id() << std::endl;
     //
     // Parameter handling
     auto parser = cpp_tools::cl_parser::make_parser(cpp_tools::cl_parser::help{}, args::input_file(),
@@ -214,7 +215,7 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     const auto order{parser.get<args::order>()};
     const auto dimension{parser.get<args::Dimension>()};
 
-    if(para.io_master())
+    // if(para.io_master())
     {
         std::cout << cpp_tools::colors::blue << "<params> Tree height: " << tree_height << cpp_tools::colors::reset
                   << '\n';
@@ -235,14 +236,14 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     {
         constexpr int dim = 2;
 
-        run<dim>(para, input_file, output_file, tree_height, group_size, group_size, order);
+        run<dim>(input_file, output_file, tree_height, group_size, group_size, order);
         break;
     }
     // case 3:
     // {
     //     constexpr int dim = 3;
 
-    //     run<dim>(para, input_file, output_file, tree_height, group_size, group_size, order);
+    //     run<dim>( input_file, output_file, tree_height, group_size, group_size, order);
     //     break;
     // }
     default:
@@ -251,5 +252,5 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     }
     }
 
-    para.end();
+    // para.end();
 }
diff --git a/cmake/dependencies/openmp.cmake b/cmake/dependencies/openmp.cmake
index e73fa3d3b94ee55414feb7ca9dff1cfbc71f23de..5d4a7d7ae14698605bb5905bc460081146c3c78c 100644
--- a/cmake/dependencies/openmp.cmake
+++ b/cmake/dependencies/openmp.cmake
@@ -1,21 +1,36 @@
 #
 # OpenMP
 # ------
+if(CMAKE_CXX_COMPILER_ID MATCHES "AppleClang")
+    # https://cliutils.gitlab.io/modern-cmake/chapters/packages/OpenMP.html
+    # build the target
+    find_file(OMP_H omp.h PATHS ENV HOMEBREW_CELLAR PATH_SUFFIXES "libomp/18.1.6/include")
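+    # NOTE: the include and lib paths below are hard-coded for Homebrew's libomp 18.1.6
+    # under /usr/local/Cellar; adjust the version and prefix to the locally installed libomp.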
+    cmake_print_variables(OMP_H)
+    add_library(OpenMP::OpenMP_CXX IMPORTED INTERFACE)
+    set_property(TARGET OpenMP::OpenMP_CXX
+        PROPERTY INTERFACE_COMPILE_OPTIONS "-Xclang" "-fopenmp")
+    set_property(TARGET OpenMP::OpenMP_CXX
+        PROPERTY INTERFACE_INCLUDE_DIRECTORIES "/usr/local/Cellar/libomp/18.1.6/include")
+    set_property(TARGET OpenMP::OpenMP_CXX
+        PROPERTY INTERFACE_COMPILE_DEFINITIONS "_OPENMP")
+    set_property(TARGET OpenMP::OpenMP_CXX
+        PROPERTY INTERFACE_LINK_DIRECTORIES "/usr/local/Cellar/libomp/18.1.6/lib")
 
-find_package(OpenMP REQUIRED)
+    # Only works if the same flag is passed to the linker; use CMake 3.9+ otherwise (Intel, AppleClang)
+    set_property(TARGET OpenMP::OpenMP_CXX
+        PROPERTY INTERFACE_LINK_LIBRARIES -lomp)
+    set(OpenMP_CXX_FOUND ON)
+else()
+    find_package(OpenMP REQUIRED)
+endif()
 
 if(OpenMP_CXX_FOUND)
     list(APPEND OMP_TARGET OpenMP::OpenMP_CXX cpp_tools::parallel_manager)
     list(APPEND OMP_COMPILE_DEFINITIONS CPP_TOOLS_PARALLEL_MANAGER_USE_OMP)
     list(APPEND FUSE_LIST OMP)
-    # cmake_print_variables(CMAKE_CXX_COMPILER_ID)
-    # cmake_print_variables(CMAKE_CXX_COMPILER_VERSION)
-    # cmake_print_variables(CMAKE_CXX_COMPILER_VERSION_INTERNAL)
-# if(CMAKE_CXX_COMPILER_ID MATCHES "Clang")
-# if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER "16.0")
-# set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -lomp")
-# endif()
-# endif()
+
 else(OpenMP_CXX_FOUND)
-    message(WARNING "OPENMP NOT FOUND")
+    message(WARNING " OPENMP NOT FOUND ")
 endif(OpenMP_CXX_FOUND)
+
+cmake_print_variables(FUSE_LIST)
\ No newline at end of file
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index a4291fe51e8e9c4265ef5fef970054fb8c62e74c..9cdc274efdd326d031375d84479a58df7a0e431e 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -3,32 +3,27 @@
 # List of source files
 set(source_tests_files
 
+    tutorial.cpp
+
     # test
     test_particles.cpp
     test_dimension.cpp
     test_dimension_low_rank.cpp
     test_dimension_omp.cpp
 
+    fmm_source_target.cpp
+
     # FMM
     test_laplace_kernels.cpp
     test_like_mrhs.cpp
 
-    # # debug & check
-    # count_particles_seq.cpp
-    # count_particles_st_seq.cpp
-    # count_particles_omp.cpp
-    # count_particles_st_omp.cpp
-
+    # debug & check
     # test_time_loop.cpp
-    # test to move in compose/sandox project
-    fmm_source_target.cpp
-    tutorial.cpp
-
-    # Test accuracy (barycentric interpolation)
-    test_accuracy.cpp
 )
+cmake_print_variables(${CMAKE_PROJECT_NAME}_USE_MPI)
 
 if(${CMAKE_PROJECT_NAME}_USE_MPI)
+
     list(APPEND source_tests_files
         test_mpi_algo.cpp
     )
diff --git a/examples/test_mpi_algo.cpp b/examples/test_mpi_algo.cpp
index c37c4e1239c9f1284d3dd1f5208af16be0e35291..9501fb2cfb564246f50a2fd58b18d19a4875c82c 100644
--- a/examples/test_mpi_algo.cpp
+++ b/examples/test_mpi_algo.cpp
@@ -4,10 +4,9 @@
 
 #include "scalfmm/container/particle.hpp"
 #include "scalfmm/interpolation/interpolation.hpp"
-#include "scalfmm/lists/sequential.hpp"
+#include "scalfmm/lists/lists.hpp"
 #include "scalfmm/matrix_kernels/laplace.hpp"
 #include "scalfmm/tools/fma_dist_loader.hpp"
-#include "scalfmm/tools/fma_loader.hpp"
 #include "scalfmm/tools/tree_io.hpp"
 #include "scalfmm/tree/box.hpp"
 #include "scalfmm/tree/cell.hpp"
@@ -31,7 +30,7 @@
 /// \code
 ///   mpirun -output-filename log --oversubscribe -np 3 ./examples/Release/test_mpi_algo
 ///   --input-file ../data/debug/circle2d_r3.fma --order 3 --tree-height 3
-///   --group-size 3 -d 2
+///    --group-size 3 -d 2
 /// \endcode
 namespace local_args
 {
@@ -45,7 +44,7 @@ namespace local_args
     };
     struct PartDistrib
     {
-        /// Unused type, m|atory per interface specification
+        /// Unused type, mandatory per interface specification
         using type = bool;
         /// The parameter is a flag, it doesn't expect a following value
         enum
@@ -55,18 +54,18 @@ namespace local_args
         cpp_tools::cl_parser::str_vec flags = {"--dist-part"};
         std::string description = "Use the particle distribution to distribute the tree";
     };
-    struct PartLeafDistrib
-    {
-        /// Unused type, m|atory per interface specification
-        using type = bool;
-        /// The parameter is a flag, it doesn't expect a following value
-        enum
-        {
-            flagged
-        };
-        cpp_tools::cl_parser::str_vec flags = {"--dist-part-leaf"};
-        std::string description = "Use two distribution one for the particle | one for the tree";
-    };
+    // struct PartLeafDistrib
+    // {
+    //     /// Unused type, mandatory per interface specification
+    //     using type = bool;
+    //     /// The parameter is a flag, it doesn't expect a following value
+    //     enum
+    //     {
+    //         flagged
+    //     };
+    //     cpp_tools::cl_parser::str_vec flags = {"--dist-part-leaf"};
+    //     std::string description = "Use two distributions: one for the particles and one for the tree";
+    // };
 }   // namespace local_args
 
 using value_type = double;
@@ -100,7 +99,6 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
     using leaf_type = scalfmm::component::leaf_view<particle_type>;
     using box_type = scalfmm::component::box<position_type>;
     using group_tree_type = scalfmm::component::dist_group_tree<cell_type, leaf_type, box_type>;
-
     ///
     ///////////////////////////////////////////////////////////////////////////////////////////////////////
     ///   Read the data in parallel
@@ -139,6 +137,18 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
         particles_set[idx] = p;
         // std::cout << p << std::endl;
     }
+    ///////////////////
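+    // Build the kernels and the FMM operator up front: its separation criterion and
+    // mutual flag are reused below to build the LET and the interaction lists.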
+    far_matrix_kernel_type mk_far{};
+    const bool mutual_near = false;
+
+    near_matrix_kernel_type mk_near{};
+    interpolator_type interpolator(mk_far, order, static_cast<std::size_t>(tree_height), box.width(0));
+    typename FMM_OPERATOR_TYPE::near_field_type near_field(mk_near, mutual_near);
+    typename FMM_OPERATOR_TYPE::far_field_type far_field(interpolator);
+    FMM_OPERATOR_TYPE fmm_operator(near_field, far_field);
+    // Build interaction lists
+    int const& separation_criterion = fmm_operator.near_field().separation_criterion();
+    bool const& mutual = fmm_operator.near_field().mutual();
     ///
     ///////////////////////////////////////////////////////////////////////////////////////////////////////
     // check
@@ -152,11 +162,10 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
     ///  2) construct the tree, then the let
     ///
     // separation criteria used to construct M2L | P2P ghosts
-    int separation = 1;
     // Construct the LET
     auto letGroupTree = scalfmm::tree::let::buildLetTree<group_tree_type>(
       para, number_of_particles, particles_set, box, leaf_level, level_shared, part_group_size, leaf_group_size, order,
-      separation, use_leaf_distribution, use_particle_distribution);
+      separation_criterion, use_leaf_distribution, use_particle_distribution);
 
     if(para.io_master())
     {
@@ -165,58 +174,95 @@ auto run(cpp_tools::parallel_manager::parallel_manager& para, const std::string&
         std::cout << cpp_tools::colors::reset;
     }
 
-    //
-    // Build interaction lists
-    int const& separation_criterion = separation;   // fmm_operator.near_field().separation_criterion();
-    bool const mutual = false;                      // fmm_operator.near_field().mutual();
     scalfmm::list::sequential::build_interaction_lists(letGroupTree, letGroupTree, separation_criterion, mutual);
 
-    scalfmm::io::trace(std::cout, letGroupTree, 2);
+#ifdef SCALFMM_DEBUG_MPI
+    {
+        const int rank = para.get_process_id();
+        std::string outName0("tree_group_rank_" + std::to_string(rank) + ".txt");
+        std::ofstream out(outName0);
+        scalfmm::io::trace(out, letGroupTree, 2);
+        std::string outName1("tree_rank_" + std::to_string(rank) + ".bin");
+        std::string header1("LOCAL TREE ");
+        scalfmm::tools::io::save(outName1, letGroupTree, header1);
+        //
+        const int nbDataPerRecord = scalfmm::container::particle_traits<particle_type>::number_of_elements;
+        const int inputs_size = scalfmm::container::particle_traits<particle_type>::inputs_size;
+        const bool verbose_write = true;   // True only for the master
+        std::string outName2("particles_rank_" + std::to_string(rank) + ".fma");
 
-    far_matrix_kernel_type mk_far{};
-    interpolator_type interpolator(mk_far, order, static_cast<std::size_t>(tree_height), box.width(0));
-    near_matrix_kernel_type mk_near{};
-    typename FMM_OPERATOR_TYPE::near_field_type near_field(mk_near);
-    typename FMM_OPERATOR_TYPE::far_field_type far_field(interpolator);
-    FMM_OPERATOR_TYPE fmm_operator(near_field, far_field);
+        scalfmm::io::FFmaGenericWriter<value_type> writer_seq(outName2, verbose_write);
+        // Get the number of particles
+        auto number_of_particles = letGroupTree.number_particles();
+        std::clog << "number_of_particles " << number_of_particles << std::endl;
+        ///
+        writer_seq.writeDataFromTree(letGroupTree, number_of_particles);
+    }
+
+#endif
+    if(para.io_master())
+    {
+        std::cout << cpp_tools::colors::blue << "Fmm with kernels: " << std::endl
+                  << "       near " << mk_near.name() << std::endl
+                  << "       far  " << mk_far.name() << std::endl
+                  << cpp_tools::colors::reset;
+    }
     //
-    auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
-    // auto operator_to_proceed =
-    //   scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2m |
-    //   scalfmm::algorithms::operators_to_proceed::m2l | scalfmm::algorithms::operators_to_proceed::l2l |
+    auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::all;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::nearfield;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::m2l;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::p2m |
+    //                            scalfmm::algorithms::operators_to_proceed::m2m |
+    //                            scalfmm::algorithms::operators_to_proceed::m2l;
+    //   | scalfmm::algorithms::operators_to_proceed::p2p |
     //   scalfmm::algorithms::operators_to_proceed::l2p;
+    //
 
     // scalfmm::algorithms::fmm[scalfmm::options::_s(scalfmm::options::omp)](tree, fmm_operator, operator_to_proceed);
-    std::cout << " call algo  scalfmm::algorithms::mpi::proc_task\n ";
+    std::cout << cpp_tools::colors::blue << "operator_to_proceed: ";
+    scalfmm::algorithms::print(operator_to_proceed);
+    std::cout << cpp_tools::colors::reset << std::endl;
+
     scalfmm::algorithms::mpi::proc_task(letGroupTree, fmm_operator, operator_to_proceed);
-    std::cout << " end  algo  scalfmm::algorithms::mpi::proc_task\n ";
 
+    // std::clog << "End scalfmm::algorithms::mpi::proc_task\n";
     ///
     ///////////////////////////////////////////////////////////////////////////////////////////////////////
     ///
     ///////////////////////////////////////////////////////////////////////////////////////////////////////
     ///   Save the data
-    // const int nbDataPerRecord = scalfmm::container::particle_traits<particle_type>::number_of_elements;
-    // const int inputs_size = scalfmm::container::particle_traits<particle_type>::inputs_size;
+    if(!output_file.empty())
+    {
+        const int nbDataPerRecord = scalfmm::container::particle_traits<particle_type>::number_of_elements;
+        const int inputs_size = scalfmm::container::particle_traits<particle_type>::inputs_size;
+        const bool verbose_write = para.io_master();   // True only for the master
+        scalfmm::io::DistFmaGenericWriter<value_type> writer(output_file, para, verbose_write);
+        /// Get the number of particles
+        // std::cout << "number_of_particles " << number_of_particles << std::endl;
+        ///
+        writer.writeHeader(centre, width, number_of_particles, sizeof(value_type), nbDataPerRecord, dimension,
+                           inputs_size);
+        ///
+        writer.writeFromTree(letGroupTree, number_of_particles);
+        // std::cout << "End writing\n" << std::flush;
+    }
 
-    // // static constexpr std::size_t nbDataPerRecord = particle_type::number_of_elements;
-    // scalfmm::tools::DistFmaGenericWriter<value_type> writer(output_file, para);
-    // /// Get the number of particles
-    // std::cout << "number_of_particles " << number_of_particles << std::endl;
-    // ///
-    // writer.writeHeader(centre, width, number_of_particles, sizeof(value_type), nbDataPerRecord, dimension,
-    // inputs_size);
-    // ///
-    // writer.writeFromTree(letGroupTree, number_of_particles);
     // ///
-    // ///////////////////////////////////////////////////////////////////////////////////////////////////////
-    if(para.io_master())
-        std::cout << "Save Tree in parallel\n";
-    // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
-    std::string outName("saveTreeLet.bin");
-    std::string header("CHEBYSHEV LOW RANK ");
-    scalfmm::tools::io::save(para, outName, letGroupTree, header);
+    // /////////////////////////////////////////////////////////////////////////////////////////////////////**
+    // #ifdef SCALFMM_DEBUG_MPI_1
+    //     if(para.io_master())
+    //         std::cout << "Save Tree in parallel\n";
+    //     // // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
+    //     std::string outName("tree_let.bin");
+    //     std::string header("CHEBYSHEV LOW RANK ");
+    //     scalfmm::tools::io::save(para, outName, letGroupTree, header);
 
+    //     const int rank = para.get_process_id();
+    //     std::string outName1("tree_rank_" + std::to_string(rank) + ".bin");
+    //     std::string header1("CHEBYSHEV LOW RANK ");
+    //     scalfmm::tools::io::save(outName1, letGroupTree, header1);
+    // #endif
     return 0;
 }
 
@@ -231,7 +277,7 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     auto parser = cpp_tools::cl_parser::make_parser(
       cpp_tools::cl_parser::help{}, args::input_file(), args::output_file(), args::tree_height{}, args::order{},
       args::thread_count{}, args::block_size{}, args::Dimension{}, local_args::PartDistrib{},
-      local_args::PartLeafDistrib{}, local_args::LevelShared{});
+      /*local_args::PartLeafDistrib{},*/ local_args::LevelShared{});
     parser.parse(argc, argv);
     // Getting comm| line parameters
     const int tree_height{parser.get<args::tree_height>()};
@@ -242,14 +288,15 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     const auto output_file{parser.get<args::output_file>()};
     const auto order{parser.get<args::order>()};
     const auto dimension{parser.get<args::Dimension>()};
+    const std::size_t nb_threads{parser.get<args::thread_count>()};
 
     bool use_particle_distribution{parser.exists<local_args::PartDistrib>()};
     bool use_leaf_distribution{!use_particle_distribution};
-    if(parser.exists<local_args::PartLeafDistrib>())
-    {
-        use_leaf_distribution = true;
-        use_particle_distribution = true;
-    }
+    // if(parser.exists<local_args::PartLeafDistrib>())
+    // {
+    //     use_leaf_distribution = true;
+    //     use_particle_distribution = false;
+    // }
 
     if(para.io_master())
     {
@@ -258,13 +305,21 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
         std::cout << cpp_tools::colors::blue << "<params> Group Size:  " << group_size << cpp_tools::colors::reset
                   << '\n';
         std::cout << cpp_tools::colors::blue << "<params> order:       " << order << cpp_tools::colors::reset << '\n';
+        std::cout << cpp_tools::colors::blue << "<params> Proc num:    " << para.get_num_processes()
+                  << cpp_tools::colors::reset << '\n';
+        std::cout << cpp_tools::colors::blue << "<params> Threads num: " << nb_threads << cpp_tools::colors::reset
+                  << '\n';
         if(!input_file.empty())
         {
             std::cout << cpp_tools::colors::blue << "<params> Input file:  " << input_file << cpp_tools::colors::reset
                       << '\n';
         }
-        std::cout << cpp_tools::colors::blue << "<params> Output file: " << output_file << cpp_tools::colors::reset
-                  << '\n';
+        if(!output_file.empty())
+        {
+            std::cout << cpp_tools::colors::blue << "<params> Output file:  " << output_file << cpp_tools::colors::reset
+                      << '\n';
+        }
+
         std::cout << cpp_tools::colors::blue << "<params> Particle Distribution: " << std::boolalpha
                   << use_particle_distribution << cpp_tools::colors::reset << '\n';
         std::cout << cpp_tools::colors::blue << "<params> Leaf Distribution:     " << std::boolalpha
@@ -292,26 +347,28 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
                            use_leaf_distribution, use_particle_distribution);
         break;
     }
-    // case 3:
-    // {
-    //     constexpr int dim = 3;
-    //     using interpolation_type = interpolator_alias<value_type, dim, matrix_kernel_type, options_chebyshev>;
-    //     using far_field_type = scalfmm::operators::far_field_operator<interpolation_type>;
+    case 3:
+    {
+        constexpr int dim = 3;
+        using interpolation_type = interpolator_alias<value_type, dim, matrix_kernel_type, options_chebyshev>;
+        using far_field_type = scalfmm::operators::far_field_operator<interpolation_type>;
 
-    //     using ffm_type = scalfmm::operators::fmm_operators<near_field_type, far_field_type>;
+        using ffm_type = scalfmm::operators::fmm_operators<near_field_type, far_field_type>;
 
-    //     run<dim, ffm_type>(para, input_file, output_file, tree_height, group_size, group_size, order, level_shared,
-    //                        use_leaf_distribution, use_particle_distribution);
-    //     break;
-    // }
+        run<dim, ffm_type>(para, input_file, output_file, tree_height, group_size, group_size, order, level_shared,
+                           use_leaf_distribution, use_particle_distribution);
+        break;
+    }
     default:
     {
         std::cerr << "Dimension should be only 2 or 3 !!\n";
     }
     }
 #ifdef SCALFMM_USE_MPI
-    std::cout << std::flush;
+    std::cout << " barrier() \n" << std::flush;
     para.get_communicator().barrier();
 #endif
+    std::cout << " end() \n" << std::flush;
+
     para.end();
 }
diff --git a/guix-tools/python-exhale.scm b/guix-tools/python-exhale.scm
new file mode 100644
index 0000000000000000000000000000000000000000..387a9017fd3e8304431318f1a1efdf2295410ef8
--- /dev/null
+++ b/guix-tools/python-exhale.scm
@@ -0,0 +1,57 @@
+(define-module (python-exhale)
+  #:use-module (guix)
+  #:use-module (guix packages)
+  #:use-module (guix download)
+  #:use-module (guix git-download)
+  #:use-module (guix hg-download)
+  #:use-module (guix gexp)
+  #:use-module (guix utils)
+  #:use-module (guix build-system python)
+  #:use-module (guix build-system pyproject)
+  #:use-module ((guix licenses) #:prefix license:)
+  #:use-module (gnu packages)
+  #:use-module (gnu packages certs)
+  #:use-module (gnu packages check)
+  #:use-module (gnu packages fonts)
+  #:use-module (gnu packages fontutils)
+  #:use-module (gnu packages graphviz)
+  #:use-module (gnu packages image)
+  #:use-module (gnu packages imagemagick)
+  #:use-module (gnu packages jupyter)
+  #:use-module (gnu packages python)
+  #:use-module (gnu packages sphinx)
+  #:use-module (gnu packages xml)
+  #:use-module (gnu packages python-build)
+  #:use-module (gnu packages python-check)
+  #:use-module (gnu packages python-crypto)
+  #:use-module (gnu packages python-web)
+  #:use-module (gnu packages python-xyz)
+  #:use-module (gnu packages time)
+  #:use-module (gnu packages python-science)
+  #:use-module (gnu packages graph)
+  #:use-module ((guix licenses) #:prefix license:)
+  #:use-module (gnu packages)
+  #:use-module (guix build-system gnu))
+
+(define-public python-exhale
+  (package
+    (name "python-exhale")
+    (version "0.3.7")
+    (source
+     (origin
+       (method url-fetch)
+       (uri (pypi-uri "exhale" version))
+       (sha256
+        (base32 "1n5hsrg7swh535bd5b3f55ldcb343yld849kjcfm2mlllp89cakm"))))
+    (build-system pyproject-build-system)
+    (propagated-inputs (list python-beautifulsoup4 python-breathe python-lxml
+                             python-six python-sphinx))
+    (native-inputs (list python-setuptools python-wheel))
+    (home-page "https://github.com/svenevs/exhale")
+    (synopsis
+     "Automatic C++ library API documentation generator using Doxygen, Sphinx and Breathe")
+    (description
+     "Exhale automatically generates C++ library API documentation using Doxygen, Sphinx and Breathe.")
+    (license #f)))
+
+;; python-exhale
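+;;
+;; Usage sketch (an illustration only): with this file kept under guix-tools/,
+;; the package can be pulled into an ad-hoc environment with, for example,
+;;   guix shell -L guix-tools/ python-exhale python-sphinx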
diff --git a/guix-tools/scalfmm-manifest-clang-mkl.scm b/guix-tools/scalfmm-manifest-clang-mkl.scm
index 285a3fce6e76c628f9f4ec9b6a4924a91d8d9de1..3dbff5ada11e18cf3d1a85768f66243e06740a2e 100644
--- a/guix-tools/scalfmm-manifest-clang-mkl.scm
+++ b/guix-tools/scalfmm-manifest-clang-mkl.scm
@@ -10,5 +10,6 @@
         "ncurses"
         "intel-oneapi-mkl"
         "grep"
+        "openmpi"
         "findutils"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-clang-openblas.scm b/guix-tools/scalfmm-manifest-clang-openblas.scm
index 7dbe2c842d78332a5499a9eb2f5693717a6d34d2..dd5b7002628f647a91110c4aa3435b2b2e4fbad3 100644
--- a/guix-tools/scalfmm-manifest-clang-openblas.scm
+++ b/guix-tools/scalfmm-manifest-clang-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-doc.scm b/guix-tools/scalfmm-manifest-doc.scm
new file mode 100644
index 0000000000000000000000000000000000000000..5b84cd5d632593fcdfa2068c2e9af08590caf95f
--- /dev/null
+++ b/guix-tools/scalfmm-manifest-doc.scm
@@ -0,0 +1,20 @@
+;; Manifest listing the packages needed to build the documentation.
+;; Pass this file to any 'guix' command that accepts a '--manifest' (or '-m') option.
+
+(specifications->manifest
+  (list "openblas"
+        "fftw"
+        "fftwf"
+        "cmake"
+        "make"
+        "gcc-toolchain"
+        "pkg-config"
+        "doxygen"
+        "coreutils"
+        "python"
+        "python-sphinx"
+        "python-recommonmark"
+        "python-breathe"
+        "python-sphinx-rtd-theme"
+        "python-exhale"))
diff --git a/guix-tools/scalfmm-manifest-gcc-mkl.scm b/guix-tools/scalfmm-manifest-gcc-mkl.scm
index 016062e39a3a9ce9ad384e5d93bbffa546fef988..c191baf58e1995ba21d85192aad633f7d9e585f0 100644
--- a/guix-tools/scalfmm-manifest-gcc-mkl.scm
+++ b/guix-tools/scalfmm-manifest-gcc-mkl.scm
@@ -11,4 +11,5 @@
         "intel-oneapi-mkl"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-gcc-openblas.scm b/guix-tools/scalfmm-manifest-gcc-openblas.scm
index 3db24721bb79849d63f9ac7dee6e74e53c7bfe25..1619176d1008a1ad06353413ed26579ced2350b3 100644
--- a/guix-tools/scalfmm-manifest-gcc-openblas.scm
+++ b/guix-tools/scalfmm-manifest-gcc-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-gcc11-openblas.scm b/guix-tools/scalfmm-manifest-gcc11-openblas.scm
index 3db24721bb79849d63f9ac7dee6e74e53c7bfe25..1619176d1008a1ad06353413ed26579ced2350b3 100644
--- a/guix-tools/scalfmm-manifest-gcc11-openblas.scm
+++ b/guix-tools/scalfmm-manifest-gcc11-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-gcc12-openblas.scm b/guix-tools/scalfmm-manifest-gcc12-openblas.scm
index a7291f952ef762d63d4874a684aac2a8b2bbe00f..37f52784f54d4f89a839aa8f830c89d8351d62a5 100644
--- a/guix-tools/scalfmm-manifest-gcc12-openblas.scm
+++ b/guix-tools/scalfmm-manifest-gcc12-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-gcc13-openblas.scm b/guix-tools/scalfmm-manifest-gcc13-openblas.scm
index a6a02f26d7ed0cd31fffe7c93eb2588d31f45eb9..d6d2f99c3817b4a8922ef23498cb758ae1827337 100644
--- a/guix-tools/scalfmm-manifest-gcc13-openblas.scm
+++ b/guix-tools/scalfmm-manifest-gcc13-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/guix-tools/scalfmm-manifest-gcc14-openblas.scm b/guix-tools/scalfmm-manifest-gcc14-openblas.scm
index cb5b2d1f3c2c01e0f69739ab7843e256b1ad7f1c..2fc252c60a377b1b03bfe58106678be20e41e051 100644
--- a/guix-tools/scalfmm-manifest-gcc14-openblas.scm
+++ b/guix-tools/scalfmm-manifest-gcc14-openblas.scm
@@ -14,4 +14,5 @@
         "pkg-config"
         "grep"
         "findutils"
+        "openmpi"
         "sed"))
diff --git a/include/scalfmm/algorithms/mpi/direct.hpp b/include/scalfmm/algorithms/mpi/direct.hpp
index d08d8fe4df9a7f5f177d7e5dfb14c4f00c0a702e..690c5e57df85a26d5a567b5deec63ae46604801b 100644
--- a/include/scalfmm/algorithms/mpi/direct.hpp
+++ b/include/scalfmm/algorithms/mpi/direct.hpp
@@ -23,7 +23,7 @@ namespace scalfmm::algorithms::mpi::pass
 	*   step 3 send/receive the particles
 	*
 	* @tparam TreeS
-        *
+    *
 	* @param tree_source  source tree (contains the particles)
 	*/
         template<typename TreeS>
@@ -33,134 +33,159 @@ namespace scalfmm::algorithms::mpi::pass
             auto comm = para.get_communicator();
             auto rank = para.get_process_id();
             auto nb_proc = para.get_num_processes();
-            if(nb_proc == 1)
+            if(nb_proc > 1)   // with a single process there is nothing to communicate
             {   // Openmp case -> no communication
-                return;
-            }
-            //
-            using grp_access_type = std::pair<decltype(tree_source.begin_leaves()), int>;
-            using mortonIdx_type = std::int64_t;
-            //
-            std::vector<std::vector<grp_access_type>> leaf_to_receive_access(nb_proc);
-            std::vector<std::vector<grp_access_type>> leaf_to_send_access(nb_proc);
-            std::vector<std::vector<mortonIdx_type>> morton_to_receive(nb_proc);   // TOREMOVE
-            std::vector<int> nb_messages_to_send(nb_proc, 0);
-            std::vector<int> nb_messages_to_receive(nb_proc, 0);
-            ///
-            auto begin_left_ghost = tree_source.begin_leaves();
-            auto end_left_ghost = tree_source.begin_mine_leaves();
-            auto begin_right_ghost = tree_source.end_mine_leaves();
-            auto end_right_ghost = tree_source.end_leaves();
-            //
-            //print leaf block
-            // for(auto it = end_left_ghost; it != begin_right_ghost; ++it)
-            // {
-            //     std::cout << **it << std::endl;
-            // }
-            //
-            auto const& leaf_distribution = tree_source.get_leaf_distribution();
-
-            scalfmm::parallel::comm::start_step1(comm, begin_left_ghost, end_left_ghost, begin_right_ghost,
-                                                 end_right_ghost, leaf_distribution, nb_messages_to_receive,
-                                                 nb_messages_to_send, leaf_to_receive_access, morton_to_receive);
-            // for(auto p = 0; p < nb_proc; ++p)
-            // {
-            //     io::print("    morton to receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
-            // }
-            // io::print("    nb_messages_to_receive ", nb_messages_to_receive);
-            // io::print("    nb_messages_to_send ", nb_messages_to_send);
-            //
-            std::vector<std::vector<mortonIdx_type>> morton_to_send(nb_proc);
-            //
-            scalfmm::parallel::comm::start_step2(nb_proc, rank, comm, nb_messages_to_receive, nb_messages_to_send,
-                                                 morton_to_receive, morton_to_send);
-
-            /////////////////////////////////////////////////////////////////////////////////
-            /// STEP 3
-            /////////////////////////////////////////////////////////////////////////////////
-            // send the particles
-            // morton_to_send list des indices de Morton.
-            // leaf_to_send_access (ptr on the group and index into  the group)
-            //
-            auto begin_grp = tree_source.begin_mine_leaves();
-            auto end_grp = tree_source.end_mine_leaves();
-
-            scalfmm::parallel::comm::build_direct_access_to_leaf(nb_proc, begin_grp, end_grp, leaf_to_send_access,
-                                                                 morton_to_send);
-            //
-            // Build the mpi type for the particles
-            //
-            static constexpr std::size_t dimension = TreeS::base_type::leaf_type::dimension;
-            static constexpr std::size_t inputs_size = TreeS::base_type::leaf_type::inputs_size;
-
-            using position_coord_type = typename TreeS::base_type::leaf_type::position_coord_type;
-            using inputs_type_ori = typename TreeS::base_type::leaf_type::inputs_type;
-
-            static_assert(!meta::is_complex_v<inputs_type_ori>, "input complex type not yet supported.");
-            using inputs_type1 = inputs_type_ori;
-            using inputs_type = std::conditional_t<meta::is_complex_v<inputs_type_ori>,
-                                                   meta::has_value_type_t<inputs_type_ori>, inputs_type_ori>;
-            // for complex value (2) otherwise 1  NOT YET USED for particles
-            int nb_input_values = meta::is_complex_v<inputs_type_ori> ? 2 : 1;
-
-            auto mpi_position_type = cpp_tools::parallel_manager::mpi::get_datatype<position_coord_type>();
-            auto mpi_input_type = cpp_tools::parallel_manager::mpi::get_datatype<inputs_type>();
-            //
-
-            // build and commit the MPI type of the particle to send
-            // std::cout << "=================== Send type ========================\n";
-
-            auto particle_type_to_send = scalfmm::parallel::comm::build_mpi_particles_type<dimension>(
-              leaf_to_send_access, inputs_size, mpi_position_type, mpi_input_type);
-
-            // send the particles
-            for(auto p = 0; p < nb_proc; ++p)
-            {
-                if(leaf_to_send_access[p].size() != 0)
+
+                //
+                using grp_access_type = std::pair<decltype(tree_source.begin_leaves()), int>;
+                using mortonIdx_type = std::int64_t;
+                //
+                std::vector<std::vector<grp_access_type>> leaf_to_receive_access(nb_proc);
+                std::vector<std::vector<grp_access_type>> leaf_to_send_access(nb_proc);
+                std::vector<std::vector<mortonIdx_type>> morton_to_receive(nb_proc);   // TOREMOVE
+                std::vector<int> nb_messages_to_send(nb_proc, 0);
+                std::vector<int> nb_messages_to_receive(nb_proc, 0);
+                ///
+                auto begin_left_ghost = tree_source.begin_leaves();
+                auto end_left_ghost = tree_source.begin_mine_leaves();
+                auto begin_right_ghost = tree_source.end_mine_leaves();
+                auto end_right_ghost = tree_source.end_leaves();
+                //
+                //print leaf block
+                // for(auto it = end_left_ghost; it != begin_right_ghost; ++it)
+                // {
+                //     std::cout << **it << std::endl;
+                // }
+                //
+                auto const& leaf_distribution = tree_source.get_leaf_distribution();
+
+                scalfmm::parallel::comm::start_step1(comm, begin_left_ghost, end_left_ghost, begin_right_ghost,
+                                                     end_right_ghost, leaf_distribution, nb_messages_to_receive,
+                                                     nb_messages_to_send, leaf_to_receive_access, morton_to_receive);
+                // for(auto p = 0; p < nb_proc; ++p)
+                // {
+                //     io::print("    morton to receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
+                // }
+                // io::print("    nb_messages_to_receive ", nb_messages_to_receive);
+                // io::print("    nb_messages_to_send ", nb_messages_to_send);
+                //
+                std::vector<std::vector<mortonIdx_type>> morton_to_send(nb_proc);
+                //
+                scalfmm::parallel::comm::start_step2(nb_proc, rank, comm, nb_messages_to_receive, nb_messages_to_send,
+                                                     morton_to_receive, morton_to_send);
+
+                /////////////////////////////////////////////////////////////////////////////////
+                /// STEP 3
+                /////////////////////////////////////////////////////////////////////////////////
+                // send the particles
+                // morton_to_send: the list of Morton indices to send.
+                // leaf_to_send_access (pointer to the group and index into the group)
+                //
+                auto begin_grp = tree_source.begin_mine_leaves();
+                auto end_grp = tree_source.end_mine_leaves();
+
+                scalfmm::parallel::comm::build_direct_access_to_components(nb_proc, begin_grp, end_grp,
+                                                                           leaf_to_send_access, morton_to_send);
+                //
+                // Build the mpi type for the particles
+                //
+                static constexpr std::size_t dimension = TreeS::base_type::leaf_type::dimension;
+                static constexpr std::size_t inputs_size = TreeS::base_type::leaf_type::inputs_size;
+
+                using position_coord_type = typename TreeS::base_type::leaf_type::position_coord_type;
+                using inputs_type_ori = typename TreeS::base_type::leaf_type::inputs_type;
+
+                static_assert(!meta::is_complex_v<inputs_type_ori>, "input complex type not yet supported.");
+                using inputs_type1 = inputs_type_ori;
+                using inputs_type = std::conditional_t<meta::is_complex_v<inputs_type_ori>,
+                                                       meta::has_value_type_t<inputs_type_ori>, inputs_type_ori>;
+                // for complex value (2) otherwise 1  NOT YET USED for particles
+                int nb_input_values = meta::is_complex_v<inputs_type_ori> ? 2 : 1;
+
+                auto mpi_position_type = cpp_tools::parallel_manager::mpi::get_datatype<position_coord_type>();
+                auto mpi_input_type = cpp_tools::parallel_manager::mpi::get_datatype<inputs_type>();
+                //
+
+                // build and commit the MPI type of the particle to send
+                // std::cout << "=================== Send type ========================\n";
+
+                auto particle_type_to_send = scalfmm::parallel::comm::build_mpi_particles_type<dimension>(
+                  leaf_to_send_access, inputs_size, mpi_position_type, mpi_input_type);
+
+                // send the particles
+                for(auto p = 0; p < nb_proc; ++p)
                 {
-                    comm.isend(MPI_BOTTOM, 1, particle_type_to_send[p], p, 777);
+                    if(leaf_to_send_access[p].size() != 0)
+                    {
+                        comm.isend(MPI_BOTTOM, 1, particle_type_to_send[p], p, 777);
+                    }
                 }
-            }
-            //
-            // receive the particle
-            std::vector<cpp_tools::parallel_manager::mpi::request> recept_mpi_status;
-            // build and commit the MPI type of the particle to receive
-            // std::cout << "=================== Receive type ========================\n";
-
-            auto particle_type_to_receive = scalfmm::parallel::comm::build_mpi_particles_type<dimension>(
-              leaf_to_receive_access, inputs_size, mpi_position_type, mpi_input_type);
-
-            for(auto p = 0; p < nb_proc; ++p)
-            {
-                if(leaf_to_receive_access[p].size() != 0)
+                //
+                // receive the particle
+                std::vector<cpp_tools::parallel_manager::mpi::request> recept_mpi_status;
+                // build and commit the MPI type of the particle to receive
+                // std::cout << "=================== Receive type ========================\n";
+
+                auto particle_type_to_receive = scalfmm::parallel::comm::build_mpi_particles_type<dimension>(
+                  leaf_to_receive_access, inputs_size, mpi_position_type, mpi_input_type);
+
+                for(auto p = 0; p < nb_proc; ++p)
                 {
-                    recept_mpi_status.push_back(comm.irecv(MPI_BOTTOM, 1, particle_type_to_receive[p], p, 777));
+                    if(leaf_to_receive_access[p].size() != 0)
+                    {
+                        recept_mpi_status.push_back(comm.irecv(MPI_BOTTOM, 1, particle_type_to_receive[p], p, 777));
+                    }
                 }
-            }
-            if(recept_mpi_status.size() > 0)
-            {
-                cpp_tools::parallel_manager::mpi::request::waitall(recept_mpi_status.size(), recept_mpi_status.data());
+                if(recept_mpi_status.size() > 0)
+                {
+                    cpp_tools::parallel_manager::mpi::request::waitall(recept_mpi_status.size(),
+                                                                       recept_mpi_status.data());
+                }
+
+                //print leaf block
+                // std::cout << "==========================================================\n";
+                // int id_group{0};
+                // for(auto ptr_group = begin_left_ghost; ptr_group != end_right_ghost; ++ptr_group)
+                // {
+                //     auto const& current_group_symbolics = (*ptr_group)->csymbolics();
+
+                //     std::cout << "*** Group of leaf index " << ++id_group << " *** index in ["
+                //               << current_group_symbolics.starting_index << ", " << current_group_symbolics.ending_index
+                //               << "[";
+                //     std::cout << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
+                //     std::cout << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", ";
+                //     std::cout << "global index =  " << current_group_symbolics.idx_global << " \n";
+                //     std::cout << "    index: ";
+                //     (*ptr_group)->cstorage().print_block_data(std::cout);
+                // }
             }
 
-            //print leaf block
-            // std::cout << "==========================================================\n";
-            // int id_group{0};
-            // for(auto ptr_group = begin_left_ghost; ptr_group != end_right_ghost; ++ptr_group)
-            // {
-            //     auto const& current_group_symbolics = (*ptr_group)->csymbolics();
-
-            //     std::cout << "*** Group of leaf index " << ++id_group << " *** index in ["
-            //               << current_group_symbolics.starting_index << ", " << current_group_symbolics.ending_index
-            //               << "[";
-            //     std::cout << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
-            //     std::cout << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", ";
-            //     std::cout << "global index =  " << current_group_symbolics.idx_global << " \n";
-            //     std::cout << "    index: ";
-            //     (*ptr_group)->cstorage().print_block_data(std::cout);
-            // }
-        }   // en d start_communications
+            // #ifndef _DEBUG_BLOCK_DATA
+            //             std::clog << "  FINAl block\n";
+            //             int id_group{0};
+            //             auto group_of_leaves = tree_source.vector_of_leaf_groups();
+            //             for(auto pg: group_of_leaves)
+            //             {
+            //                 auto const& current_group_symbolics = pg->csymbolics();
+            //                 std::cout << "*** Group of leaf index " << ++id_group << " *** index in ["
+            //                           << current_group_symbolics.starting_index << ", " << current_group_symbolics.ending_index
+            //                           << "[";
+            //                 std::cout << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
+            //                 std::cout << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", ";
+            //                 std::cout << "global index =  " << current_group_symbolics.idx_global << " \n" << std::flush;
+            //                 // std::cout << "    index: ";
+            //                 // std::clog << "block index " << tt++ << std::endl;
+            //                 pg->cstorage().print_block_data(std::clog);
+            //             }
+            //             std::clog << "  ---------------------------------------------------\n";
+            // #endif
+        }   // end start_communications
     }   // namespace comm
 
+    // template<typename Tree, typename NearField>
+    // inline auto direct_mine_ghost(Tree const& tree, NearField const& nearfield) -> void
+    // {
+    // }
     /**
      * @brief Compute direct interaction between particles
      *
@@ -181,7 +206,11 @@ namespace scalfmm::algorithms::mpi::pass
         comm::start_communications(tree_source);
         // std::cout << "   end comm  " << std::endl << std::flush;
         // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
-        scalfmm::algorithms::omp::pass::direct(tree_source, tree_target, near_field);
+        omp::pass::direct(tree_source, tree_target, near_field);
+        // if(nearfield.mutual())
+        // {
+        //     direct_mine_ghost(tree_target, near_field);
+        // }
     }
 }   // namespace scalfmm::algorithms::mpi::pass
 
diff --git a/include/scalfmm/algorithms/mpi/downward.hpp b/include/scalfmm/algorithms/mpi/downward.hpp
index 71d648a16148e64158b05bb6906ce800cf96aa2a..1ed5e10910fb96fa29555f6cf0573c5b92512109 100644
--- a/include/scalfmm/algorithms/mpi/downward.hpp
+++ b/include/scalfmm/algorithms/mpi/downward.hpp
@@ -7,16 +7,47 @@
 
 #include "scalfmm/operators/l2l.hpp"
 #ifdef _OPENMP
-
-#include <omp.h>
-
 #include "scalfmm/algorithms/omp/downward.hpp"
+#include <omp.h>
 #endif   // _OPENMP
 
 #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 
 namespace scalfmm::algorithms::mpi::pass
 {
+    /// @brief Construct the vector of dependencies (child group)
+    /// @tparam IteratorType
+    /// @tparam MortonType
+    /// @tparam Dependencies_t
+    /// @tparam dimension
+    /// @param begin first iterator on the groups of cells (child)
+    /// @param end  last iterator on the groups of cells (child)
+    /// @param parent_morton_index the parent index
+    /// @param dependencies  the vector of dependencies
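+    ///
+    /// Minimal call sketch (illustrative; `tree`, `level` and `parent_morton_index`
+    /// are assumed to exist, `dep_type` being the group dependency pointer type):
+    /// @code
+    /// std::vector<dep_type> dependencies;
+    /// build_downward_dependencies<dimension>(tree.begin_mine_cells(level + 1),
+    ///                                        tree.end_mine_cells(level + 1),
+    ///                                        parent_morton_index, dependencies);
+    /// @endcode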
+    template<int dimension, typename IteratorType, typename MortonType, typename Dependencies_t>
+    void build_downward_dependencies(IteratorType begin, IteratorType end, MortonType const& parent_morton_index,
+                                     Dependencies_t& dependencies)
+    {
+        for(auto grp_ptr = begin; grp_ptr != end; ++grp_ptr)
+        {
+            auto const& csymb = (*grp_ptr)->csymbolics();
+            // iterate on the cells in the same group
+            // we move forward in the index vector
+            // std::cout << "[" << csymb.starting_index << " < " << parent_morton_index << " < " << csymb.ending_index
+            //           << "] ?" << std::endl
+            //           << std::flush;
+            if(parent_morton_index == ((csymb.ending_index - 1) >> dimension))
+            {
+                // std::cout << parent_morton_index << " add depend for grp with Int [" << csymb.starting_index << ", "
+                //           << csymb.ending_index << "]" << std::endl;
+                dependencies.push_back(&(grp_ptr->get()->ccomponent(0).clocals(0)));
+            }
+            else
+            {
+                break;
+            }
+        }
+    }
     /**
      * @brief perform the l2l communications for the father level
      *
@@ -28,86 +59,169 @@ namespace scalfmm::algorithms::mpi::pass
     template<typename Tree>
     inline auto downward_communications_level(const int& level, Tree& tree) -> void
     {
+        //  value_type: value type of the cells (used for the buffer of locals)
         using value_type = typename Tree::base_type::cell_type::value_type;
-        static constexpr int nb_inputs = Tree::cell_type::storage_type::inputs_size;
-        static constexpr std::size_t dimension = Tree::base_type::box_type::dimension;
+        using dep_type = typename Tree::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
 
+        static constexpr int nb_outputs = Tree::cell_type::storage_type::outputs_size;
+        static constexpr std::size_t dimension = Tree::base_type::box_type::dimension;
+        // number of theoretical children
+        constexpr int nb_children = math::pow(2, dimension);
+        static constexpr auto prio{omp::priorities::max};
+        //
         auto child_level = level + 1;
-        auto const& distrib = tree.get_cell_distribution(child_level);
 
-        // compute the size of the multipoles to send (generic) versus  math::pow(order, dimension)
-        auto it_group = tree.end_mine_cells(level) - 1;       // last group the I own
-        auto pos = it_group->get()->size() - 1;               // index of the last cell in the group
-        auto const& cell = it_group->get()->component(pos);   // the cell
-
-        auto const& m = cell.cmultipoles();
-        auto size{int(nb_inputs * m.at(0).size())};
+        // compute the size of the locals to send (generic) versus  math::pow(order, dimension)
+        auto it_last_parent_group = tree.end_mine_cells(level) - 1;       // last parent-level group that I own
+        auto pos = it_last_parent_group->get()->size() - 1;               // index of the last cell
+        auto const& cell = it_last_parent_group->get()->component(pos);   // the cell
+        auto const& m = cell.clocals();
+        auto size_local{int(nb_outputs * m.at(0).size())};   // size of a local
+        //
         // For the communications
         auto& para = tree.get_parallel_manager();
-        auto comm = para.get_communicator();
-        auto rank = comm.rank();
-        int nb_proc = comm.size();
+        auto* comm = &(para.get_communicator());
+        auto rank = comm->rank();
+        int nb_proc = comm->size();
         int tag_data = 2201 + 10 * level;
-
-        // Send
+        std::vector<dep_type> dependencies_in;
+        //
+        auto ptr_tree = &tree;
+        auto const& distrib = tree.get_cell_distribution(child_level);
+        // std::clog << "distrib me [" << distrib[rank][0] << "," << distrib[rank][1] << "]\n";
+        // Send to the right the last locals
         if(rank != nb_proc - 1)
         {
+            // std::clog << "   Send step " << level << "\n";
+            // get the  distribution at child  level
             auto last_child_index = distrib[rank][1] - 1;
             auto first_child_index_after_me = distrib[rank + 1][0];
-            // dependencies in on th group
-            if((last_child_index >> dimension) == (first_child_index_after_me >> dimension))
+            // dependencies (in) on the group
+            // check whether my last child cell and the first right-ghost child share the same parent
+            auto parent_morton_index = last_child_index >> dimension;
+            // std::clog << " downward last_child_index " << last_child_index << " its parent " << parent_morton_index
+            //           << "  first_child_index_after_me " << first_child_index_after_me << " its parent "
+            //           << (first_child_index_after_me >> dimension) << std::endl
+            //           << std::flush;
+            if(parent_morton_index == (first_child_index_after_me >> dimension))
             {
-                std::vector<value_type> buffer(size);
+                // Two processes share the same parent
+                // iterator on my first child group
+                auto first_group_of_child = tree.begin_mine_cells(child_level);
+                auto first_index_child = first_group_of_child->get()->component(0).index();
+                auto parent_of_last_index_child = first_index_child >> dimension;
 
-                // I have to send a message from my right to update the multipoles of the first
-                // cells of the right ghosts.
-                // temporary buffer
+                std::cout << std::flush;
+                // dependencies on the parent group
+                auto dep_parent = &(it_last_parent_group->get()->ccomponent(0).clocals(0));
+                // std::cout << " downward dep(in) on groupe dep_parent  " << dep_parent << std::endl << std::flush;
+                // depend(iterator(std::size_t it = 0 dependencies.size()), inout : (dependencies[it])[0]),
 
-                auto nb_m = m.size();
-                auto it = std::begin(buffer);
-                for(std::size_t i{0}; i < nb_m; ++i)
+#pragma omp task default(none) firstprivate(comm, rank, tag_data, it_last_parent_group, last_child_index)              \
+  shared(std::clog) depend(in : dep_parent[0], ptr_tree[0]) priority(prio)
                 {
-                    auto const& ten = m.at(i);
-                    std::copy(std::begin(ten), std::end(ten), it);
-                    it += ten.size();
-                }
-                auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+                    // I have to send my last parent cell's locals to the process on my right
+                    // so that it can update the last cell of its left ghosts.
+                    // temporary buffer
+                    auto pos = it_last_parent_group->get()->size() - 1;
+                    auto const& cell = it_last_parent_group->get()->component(pos);   // the cell
 
-                comm.isend(buffer.data(), size, mpi_type, rank + 1, tag_data);
-            }
-        }
+                    auto const& m = cell.clocals();
+                    auto size_local{int(nb_outputs * m.at(0).size())};
+                    auto nb_m = m.size();
+                    std::vector<value_type> buffer(size_local);
+
+                    // std::cout << "cell index: " << cell.index() << " = parent " << (last_child_index >> dimension)
+                    //           << "\n";
+                    // loop to serialize the locals
+                    auto it = std::begin(buffer);
+                    for(std::size_t i{0}; i < nb_m; ++i)
+                    {
+                        auto const& ten = m.at(i);
+                        std::copy(std::begin(ten), std::end(ten), it);
+                        it += ten.size();
+                    }
+                    // io::print("buffer(send) ", buffer);
+
+                    auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+                    // std::clog << "   send buffer to " << rank + 1 << std::endl;
+                    comm->isend(buffer.data(), size_local, mpi_type, rank + 1, tag_data);
+                    // std::cout << " downward(task_send)  buffer(rank=" << std::to_string(rank) << "): " << std::flush;
+                    // for(int i = 0; i < buffer.size(); ++i)
+                    // {
+                    //     std::cout << "  " << buffer[i] << std::flush;
+                    // }
+                    // std::cout << std::endl << std::flush;
+                }   // end task
+            }   // end same parent
+        }   // end rank !- proc -1
         // Receive
         if(rank > 0)
         {
+            // std::clog << "Receive step\n";
+
             auto last_child_index_before_me = distrib[rank - 1][1] - 1;
             auto first_child_index = distrib[rank][0];
             // dependencies out on the group
-
+            // check if same parent
+            // std::clog << "downward receive comm  last_child_index_before_me " << last_child_index_before_me
+            //           << " parent " << (last_child_index_before_me >> dimension) << " first_child_index "
+            //           << first_child_index << " its parent " << (first_child_index >> dimension) << std::endl
+            //           << std::flush;
             if((last_child_index_before_me >> dimension) == (first_child_index >> dimension))
             {
-                std::vector<value_type> buffer(size);
-
-                // I have to receive a message from my left to update the multipoles of the last
-                // cells of the left ghosts.
-                auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
-                comm.recv(buffer.data(), size, mpi_type, rank - 1, tag_data);
-
-                /// set the multipoles in the ghost
-                auto it_group = tree.begin_mine_cells(level) - 1;   // last left ghosts
-                auto pos = it_group->get()->size() - 1;             // index of the last cell in the group
-                auto& cell = it_group->get()->component(pos);
-                auto& m = cell.multipoles();
-
-                auto nb_m = m.size();
-                auto it = std::begin(buffer);
-                for(std::size_t i{0}; i < nb_m; ++i)
+                // task to do
+                // std::cout << " downward receive task to do perform " << std::endl << std::flush;
+
+                // dependencies on left ghost parent
+                auto gs = it_last_parent_group->get()->size();
+                // std::cout << " gs = " << gs << std::endl << std::flush;
+                int nb_grp_dep =
+                  std::min(static_cast<int>(nb_children / gs + 1),
+                           static_cast<int>(std::distance(it_last_parent_group, tree.end_cells(child_level))));
+                auto it_last_parent_group = tree.begin_mine_cells(level) - 1;
+                auto dep_ghost_parent = &(it_last_parent_group->get()->ccomponent(0).clocals(0));
+                // std::cout << " downward(receive) dependencies(out): " << dep_ghost_parent << std::endl << std::flush;
+
+#pragma omp task default(none) firstprivate(comm, rank, tag_data, size_local, it_last_parent_group) shared(std::clog)  \
+  depend(out : dep_ghost_parent[0], ptr_tree[0]) priority(prio)
                 {
-                    auto& ten = m.at(i);
-                    std::copy(it, it + ten.size(), std::begin(ten));
-                    it += ten.size();
-                }
-            }
-        }
+                    // std::clog << "      Same parent\n ";
+                    // Same parent, I have to receive a message from my left
+                    // to update the locals of the last cells of the left ghosts.
+                    std::vector<value_type> buffer(size_local);
+                    // blocking receive ( We block the submission of L2L tasks at this level )
+                    auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+                    comm->recv(buffer.data(), size_local, mpi_type, rank - 1, tag_data);
+                    // std::cout << " downward(task receive) buffer:";
+                    // for(int i = 0; i < buffer.size(); ++i)
+                    // {
+                    //     std::cout << "  " << buffer[i];
+                    // }
+                    // std::cout << std::endl;
+                    /// set the locals in the last left ghosts and in last cell
+                    auto it_group = it_last_parent_group;
+                    auto pos = it_group->get()->size() - 1;   // index of the last cell in the group
+                    auto& cell = it_group->get()->component(pos);
+                    auto& m = cell.locals();
+                    // std::clog << "cell index: " << cell.index() << " = parent " << (cell.index() >> dimension) << "\n";
+
+                    auto nb_m = m.size();
+                    // std::cout << " cell index: " << cell.index() << " level " << cell.csymbolics().level << "\n";
+                    // io::print("buffer(recv) ", buffer);
+                    auto it = std::begin(buffer);
+                    for(std::size_t i{0}; i < nb_m; ++i)
+                    {
+                        auto& ten = m.at(i);
+                        // std::cout << " ten before " << ten << std::endl;
+                        std::copy(it, it + ten.size(), std::begin(ten));
+                        // std::transform(it, it + ten.size(), std::begin(ten), std::begin(ten), std::plus<>{});
+                        // std::cout << " ten after " << ten << std::endl;
+                        it += ten.size();
+                    }
+                }   // end task
+            }   // end same parent
+        }   // end rank > 0
     }
 
     /**
@@ -128,8 +242,14 @@ namespace scalfmm::algorithms::mpi::pass
 
         for(std::size_t level = top_height; level < leaf_level; ++level)
         {
+            // std::cout << "   L2L downward  : " << level << " -> " << level + 1 << std::endl << std::flush;
+            // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
+            // update the ghost at the current level (father)
             downward_communications_level(level, tree);
+            // std::cout << "   end downward comm  " << level << std::endl << std::flush;
+            // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
 
+            //  compute at the level
             omp::pass::downward_level(level, tree, approximation);
         }
     }
diff --git a/include/scalfmm/algorithms/mpi/proc_task.hpp b/include/scalfmm/algorithms/mpi/proc_task.hpp
index bea9aef9bc125696b0479c474b996dba68e02f1f..798cc5ead69f54bea23c27c9e8f3c5c186b20ab5 100644
--- a/include/scalfmm/algorithms/mpi/proc_task.hpp
+++ b/include/scalfmm/algorithms/mpi/proc_task.hpp
@@ -74,7 +74,7 @@ namespace scalfmm::algorithms::mpi
                           << "WARNING the task priorities are not (fully) available. set OMP_MAX_TASK_PRIORITY to "
                           << omp::priorities::max + 1 << cpp_tools::colors::reset << std::endl;
             }
-
+            tree_target.get_parallel_manager().get_communicator().barrier();
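+            // the barrier above makes every MPI process reach this point before the passes are submitted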
             const auto op = tree_target.height() == 2 ? operators_to_proceed::p2p : op_in;
 
             if constexpr(options::has(s, options::timit))
@@ -94,18 +94,23 @@ namespace scalfmm::algorithms::mpi
                     }
                     if((op & operators_to_proceed::p2p) == operators_to_proceed::p2p)
                     {
+                        // std::cout << "pass::direct \n";
                         pass::direct(tree_source, tree_target, near_field);
                     }
+
                     if(tree_target.is_interaction_m2l_lists_built() == false)
                     {
                         list::omp::build_m2l_interaction_list(tree_source, tree_target, separation_criterion);
                     }
+
                     if((op & operators_to_proceed::p2m) == operators_to_proceed::p2m)
                     {
+                        // std::cout << "pass::leaf_to_cell \n";
                         scalfmm::algorithms::omp::pass::leaf_to_cell(tree_source, far_field);
                     }
                     if((op & operators_to_proceed::m2m) == operators_to_proceed::m2m)
                     {
+                        // std::cout << "pass::upward \n";
                         pass::upward(tree_source, far_field.approximation());
                     }
                     // if(same_tree && tree_target.box().is_periodic())
@@ -116,16 +121,21 @@ namespace scalfmm::algorithms::mpi
                     // }
                     if((op & operators_to_proceed::m2l) == operators_to_proceed::m2l)
                     {
+                        // std::cout << "pass::transfer   remove_leaf_level \n";
                         pass::transfer(tree_source, tree_target, far_field, buffers,
                                        scalfmm::algorithms::omp::pass::split_m2l::remove_leaf_level);
                     }
                     if((op & operators_to_proceed::l2l) == operators_to_proceed::l2l)
                     {
+                        // std::cout << "pass::downward\n";
+
                         pass::downward(tree_target, far_field.approximation());
                     }
 
                     if((op & operators_to_proceed::m2l) == operators_to_proceed::m2l)
                     {
+                        // std::cout << "pass::transfer   leaf_level \n";
+
                         pass::transfer(tree_source, tree_target, far_field, buffers,
                                        scalfmm::algorithms::omp::pass::split_m2l::leaf_level);
                     }
@@ -136,6 +146,26 @@ namespace scalfmm::algorithms::mpi
                 }
             }   // end parallel
 
+#ifdef _DEBUG_BLOCK_DATA
+            std::clog << "\n";
+            std::clog << "  FINAl block (end proc_task)\n";
+            int id_group{0};
+            auto group_of_leaves = tree_source.vector_of_leaf_groups();
+            for(auto pg: group_of_leaves)
+            {
+                auto const& current_group_symbolics = pg->csymbolics();
+                std::clog << "*** Group of leaf index " << ++id_group << " *** index in ["
+                          << current_group_symbolics.starting_index << ", " << current_group_symbolics.ending_index
+                          << "[";
+                std::clog << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
+                std::clog << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", ";
+                std::clog << "global index =  " << current_group_symbolics.idx_global << " \n" << std::flush;
+                // std::cout << "    index: ";
+                // std::clog << "block index " << tt++ << std::endl;
+                pg->cstorage().print_block_data(std::clog);
+            }
+            std::clog << "  ---------------------------------------------------\n";
+#endif
             scalfmm::algorithms::omp::impl::delete_buffers(buffers);
 
             if constexpr(options::has(s, options::timit))
diff --git a/include/scalfmm/algorithms/mpi/transfer.hpp b/include/scalfmm/algorithms/mpi/transfer.hpp
index 665fbc036cba80ea6d72a9003a5de55c8ccd1846..5cb2dd20277d1b0981d0c18ac5ddc72aec26dcec 100644
--- a/include/scalfmm/algorithms/mpi/transfer.hpp
+++ b/include/scalfmm/algorithms/mpi/transfer.hpp
@@ -1,12 +1,18 @@
 // --------------------------------
 // See LICENCE file at project root
 // File : scalfmm/algorithms/mpi/transfer.hpp
 // --------------------------------
 #ifndef SCALFMM_ALGORITHMS_MPI_TRANSFER_HPP
 #define SCALFMM_ALGORITHMS_MPI_TRANSFER_HPP
 
 #ifdef _OPENMP
 
+#include <iostream>
+#include <map>
+#include <omp.h>
+#include <ostream>
+#include <utility>
+
 #include "scalfmm/algorithms/omp/transfer.hpp"
 #include "scalfmm/meta/traits.hpp"
 #include "scalfmm/operators/tags.hpp"
@@ -19,12 +25,596 @@
 
 #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 
-#include <map>
-#include <omp.h>
-#include <utility>
-
 namespace scalfmm::algorithms::mpi::pass
 {
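+    /// @brief Debug helper: for each (group iterator, index) pair in @p access, print the group
+    /// pointer, the Morton index of the cell and its first transfer multipole.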
+    template<typename ContainerType>
+    void print_access(std::ostream& out, ContainerType& access)
+    {
+        // for(auto p = 0; p < cells_to_access.size(); ++p)
+        // {
+        //     auto& access = cells_to_access[p];
+
+        out << "task_send_multipole_at_level for process   size " << access.size() << std::endl << std::flush;
+        for(auto i = 0; i < access.size(); ++i)
+        {
+            out << "   task_send  " << i << "  ptr  " << std::flush << access[i].first->get() << "  morton "
+                << (*(access[i].first))->component(access[i].second).csymbolics().morton_index << std::flush
+                << "multipole " << (*(access[i].first))->component(access[i].second).transfer_multipoles().at(0)
+                << std::endl
+                << std::flush;
+        }
+        out << "---------------------------" << std::endl << std::flush;
+        // }
+    }
+    /**
+     * @brief Build the buffer of multipoles to send (no longer used)
+     *
+     * @tparam IteratorType
+     * @tparam VectorOfVectorMortonType
+     * @tparam BufferType
+     */
+    template<typename IteratorType, typename VectorOfVectorMortonType, typename BufferType>
+    auto build_buffer_func(IteratorType first_group_ghost, IteratorType last_group_ghost,
+                           VectorOfVectorMortonType const& index_to_send, BufferType& buffer)
+    {
+        try
+        {
+            //		    std::cout <<  " -----      build_buffer   ----\n" <<std::flush;
+            //		    std::cout <<  " -----      buffer size " <<buffer.size() <<std::endl<<std::flush;
+            //		    std::cout <<  " -----     index_to_send size " <<index_to_send.size() <<std::endl<<std::flush;
+            int idx{0};
+            int max_idx = index_to_send.size();   // loop on the groups
+            auto it = std::begin(buffer);
+            for(auto grp_ptr = first_group_ghost; grp_ptr != last_group_ghost; ++grp_ptr)
+            {
+                int start_grp{0};
+
+                auto const& csymb = (*grp_ptr)->csymbolics();
+                // iterate on the cells
+                while(idx < max_idx and math::between(index_to_send[idx], csymb.starting_index, csymb.ending_index))
+                {   // find cell inside the group
+                    int pos{-1};
+                    for(int i = start_grp; i < (*grp_ptr)->size(); ++i)
+                    {
+                        auto morton = (*grp_ptr)->component(i).csymbolics().morton_index;
+                        if(index_to_send[idx] == morton)
+                        {
+                            pos = i;
+                            start_grp = i + 1;
+                            // std::cout << "   pos = " << pos << std::endl;
+                            break;
+                        }
+                    }
+                    //			     std::cout << " morton to find " << index_to_send[idx] << " cell found "
+                    //			               << (*grp_ptr)->component(pos).csymbolics().morton_index << '\n';
+                    auto const& cell = (*grp_ptr)->component(pos);
+                    auto const& m = cell.ctransfer_multipoles();
+                    auto nb_m = m.size();
+                    //			    std::cout << "          nb_m" <<  m.size() <<std::endl;
+                    for(std::size_t i{0}; i < nb_m; ++i)
+                    {
+                        auto const& ten = m.at(i);
+                        std::copy(std::begin(ten), std::end(ten), it);
+                        it += ten.size();
+                    }
+                    ++idx;
+                }
+            }
+            //		                  std::cout <<  " -----      build_buffer   ----\n" <<std::flush;
+        }
+        catch(std::exception& e)
+        {
+            std::cout << " error in buffer building !!!!!!!!!\n";
+            std::cout << e.what() << '\n' << std::flush;
+        }
+    }
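+    /// @brief Serialize the transfer multipoles of the accessed cells into a contiguous buffer.
+    ///
+    /// The buffer is resized to (number of cells) x nb_inputs x (size of one multipole tensor),
+    /// the cells being reached through the (group iterator, index) pairs in @p cells_to_send_access.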
+    template<typename BufferType, typename AccessType>
+    auto build_buffer(const int nb_inputs, AccessType const& cells_to_send_access) -> BufferType
+    {
+        BufferType buffer;
+        try
+        {
+            //number of cells x nb inputs x size of an input
+
+            auto const& cell = (cells_to_send_access[0].first->get())->component(cells_to_send_access[0].second);
+            const int multipoleSize{int(cell.transfer_multipoles().at(0).size())};
+
+            const int buffer_size{int(cells_to_send_access.size()) * nb_inputs * multipoleSize};
+            buffer.resize(buffer_size);
+            std::cout << " -----      build_buffer   ----\n" << std::flush;
+            std::cout << " -----      buffer size " << buffer.size() << std::endl << std::flush;
+            auto it = std::begin(buffer);
+            // iterate on the cells
+            for(auto access: cells_to_send_access)
+            {
+                auto const& cell = (*(access.first))->component(access.second);
+
+                //			     std::cout << " morton to find " << index_to_send[idx] << " cell found "
+                //			               << (*grp_ptr)->component(pos).csymbolics().morton_index << '\n';
+                auto const& m = cell.transfer_multipoles();
+                auto nb_m = m.size();
+                //			    std::cout << "          nb_m" <<  m.size() <<std::endl;
+                for(std::size_t i{0}; i < nb_m; ++i)
+                {
+                    auto const& ten = m.at(i);
+                    std::copy(std::begin(ten), std::end(ten), it);
+                    it += ten.size();
+                }
+            }
+            std::cout << " -----      build_buffer   ----\n" << std::flush;
+            // io::print("buffer: ", buffer);
+        }
+        catch(std::exception& e)
+        {
+            std::cout << " error in buffer building !!!!!!!!!\n";
+            std::cout << e.what() << '\n' << std::flush;
+        }
+        return buffer;
+    }
+
+    /// @brief Perform the communications to send the multipoles
+    ///
+    /// @tparam TreeS      the tree type
+    /// @tparam VectorOfVectorMortonType
+    /// @param level    the level in the tree
+    /// @param tree     the tree containing the multipoles to send
+    /// @param morton_to_send  for each process, the Morton indices of the cells whose multipoles must be sent
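+    ///
+    /// The send is wrapped in an OpenMP task whose 'in' dependencies are the multipole blocks of
+    /// the local groups holding cells to send, so it only runs after the upward pass has produced
+    /// those multipoles. Call sketch (illustrative; the arguments are prepared by the caller):
+    /// @code
+    /// task_send_multipole_at_level(level, tree, morton_to_send);
+    /// @endcode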
+    template<typename TreeS, typename VectorOfVectorMortonType>
+    void task_send_multipole_at_level(int const& level, TreeS& tree, VectorOfVectorMortonType const& morton_to_send)
+    {
+        static constexpr auto prio{omp::priorities::max};
+        static constexpr int nb_inputs = TreeS::cell_type::storage_type::inputs_size;
+        using multipole_type = typename TreeS::base_type::cell_type::storage_type::transfer_multipole_type;
+
+        using dependencies_type = typename TreeS::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
+        std::vector<dependencies_type> dependencies;
+
+        auto& para = tree.get_parallel_manager();
+        auto comm = para.get_communicator();
+        const auto nb_proc = para.get_num_processes();
+        const auto rank = para.get_process_id();
+
+        for(auto p = 0; p < nb_proc; ++p)
+        {
+            // std::cout << "  Morton to send to " << p << "   " << morton_to_send[p].size() << std::endl;
+            // io::print(" morton_to_send[" + std::to_string(p) + "]", morton_to_send[p]);
+
+            if(morton_to_send[p].size() > 0)
+            {
+                /// We first construct the 'in' dependencies to ensure that the multipoles
+                /// have been updated by the previous pass.
+                parallel::utils::build_multipoles_dependencies_from_morton_vector(
+                  tree.begin_mine_cells(level), tree.end_mine_cells(level), morton_to_send[p], dependencies);
+            }
+        }
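+        // sort and deduplicate the dependencies so each multipole block appears only once in the depend clause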
+        std::sort(std::begin(dependencies), std::end(dependencies));
+        auto last = std::unique(std::begin(dependencies), std::end(dependencies));
+        dependencies.erase(last, dependencies.end());
+        io::print("M2L-old dependencies(send): ", dependencies);
+        // //
+        // // build direct access to the cells whose multipoles are to be sent
+        // //
+        // using grp_access_type = std::pair<decltype(tree.begin_cells(level)), int>;
+        // auto mpi_multipole_value_type = cpp_tools::parallel_manager::mpi::get_datatype<multipole_type>();
+        // std::vector<std::vector<grp_access_type>> cells_to_send_access(nb_proc);
+        // auto begin_grp = tree.begin_mine_cells(level);
+        // auto end_grp = tree.end_mine_cells(level);
+        // // std::clog << "          build_direct_access_to_components " << std::endl << std::flush;
+        // scalfmm::parallel::comm::build_direct_access_to_components(nb_proc, begin_grp, end_grp, cells_to_send_access,
+        //                                                            morton_to_send);
+
+        //     for(auto p = 0; p < nb_proc; ++p)
+        //   {
+        //         auto& access = cells_to_send_access[p];
+
+        //         std::cout << "task_send_multipole_at_level for process " << p << "  size " << access.size() << std::endl
+        //                   << std::flush;
+        //         for(auto i = 0; i < access.size(); ++i)
+        //         {
+        //             std::cout << "   task_send  " << i << "  ptr  " << std::flush << access[i].first->get() << "  index "
+        //                       << access[i].second << "  morton " << std::flush
+        //                       << (*(access[i].first))->component(access[i].second).csymbolics().morton_index
+        //                       << " multipoles "
+        //                       << (*(access[i].first))->component(access[i].second).transfer_multipoles().at(0) << std::endl
+        //                       << std::flush;
+        //         }
+        //         std::cout << "---------------------------" << std::endl << std::flush;
+        //     }
+        static constexpr std::size_t inputs_size = TreeS::base_type::cell_type::inputs_size;
+        //
+        // construct the MPI type to send the all multipoles
+        //
+        // tree.get_send_multipole_types(level) = scalfmm::parallel::comm::build_mpi_multipoles_type(
+        //   cells_to_send_access, inputs_size, mpi_multipole_value_type);
+
+        // auto const& multipole_type_to_send = tree.get_send_multipole_types(level);
+        // tree.print_send_multipole_types(level);
+        // for(auto p = 0; p < nb_proc; ++p)
+        // {
+        //     std::cout << " m2l(prep) ptr_data_type(" << p << ") " << &(multipole_type_to_send[p]) << " level: " << level
+        //               << std::endl
+        //               << std::flush;
+        // }
+        //
+        // std::clog << "          end build_mpi_multipoles_type " << std::endl << std::flush;
+        auto ptr_tree = &tree;
+//
+// communication task: the multipoles to send are re-read from the tree inside the task
+#pragma omp task untied firstprivate(rank, nb_proc, level, dependencies, ptr_tree)                                     \
+  depend(iterator(std::size_t it = 0 : dependencies.size()), in : (dependencies[it])[0]) priority(prio)
+        {
+            std::vector<cpp_tools::parallel_manager::mpi::request> send_mpi_status;
+
+            std::cout << "m2l-old  task(send) " << std::endl << std::flush;
+            io::print("m2l-old  task(send) dependencies(in) ", dependencies);
+            // parallel::comm::print_all_cells(*ptr_tree, level, "M2L task(send)");
+            auto morton_to_send1 = ptr_tree->send_morton_indexes(level);
+
+            for(int p = 0; p < nb_proc; ++p)
+            {
+                io::print(" morton_to_send1-old  ", morton_to_send1[p]);
+            }
+            //
+            // build direct access to the cells whose multipoles are to be sent
+            //
+            using grp_access_type = std::pair<decltype(tree.begin_cells(level)), int>;
+            auto mpi_multipole_value_type = cpp_tools::parallel_manager::mpi::get_datatype<multipole_type>();
+            std::vector<std::vector<grp_access_type>> cells_to_send_access(nb_proc);
+            auto begin_grp = ptr_tree->begin_mine_cells(level);
+            auto end_grp = ptr_tree->end_mine_cells(level);
+            // std::clog << "          build_direct_access_to_components " << std::endl << std::flush;
+            scalfmm::parallel::comm::build_direct_access_to_components(nb_proc, begin_grp, end_grp,
+                                                                       cells_to_send_access, morton_to_send1);
+            // tree.get_send_multipole_types(level) = scalfmm::parallel::comm::build_mpi_multipoles_type(
+            //   cells_to_send_access, inputs_size, mpi_multipole_value_type);
+            auto multipole_type_to_send = scalfmm::parallel::comm::build_mpi_multipoles_type(
+              cells_to_send_access, inputs_size, mpi_multipole_value_type);
+            // auto const& multipole_type_to_send = ptr_tree->get_send_multipole_types(level);
+            tree.print_send_multipole_types(level);
+
+            for(auto p = 0; p < nb_proc; ++p)
+            {
+                if(morton_to_send1[p].size() > 0)
+                // if(multipole_type_to_send[p] != MPI_DATATYPE_NULL)
+                {
+                    // print_access(std::cout, cells_to_send_access[p]);
+
+                    std::cout << "m2l-old  task(send) send to " << p << std::endl << std::flush;
+                    std::cout << " m2l-old (task) ptr_data_type(" << p << ") " << &(multipole_type_to_send[p])
+                              << " level: " << level << std::endl
+                              << std::flush;
+
+                    send_mpi_status.push_back(comm.isend(MPI_BOTTOM, 1, multipole_type_to_send[p], p, 611));
+
+                    std::cout << " m2l(task)-old  end send to " << p << "\n" << std::flush;
+                }
+            }
+
+            std::cout << " m2l(task)-old  end task \n" << std::flush;
+
+        }   // end task
+    }
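The send task above relies on `scalfmm::parallel::comm::build_mpi_multipoles_type` to build, for each destination rank, an MPI derived datatype that addresses the multipole storage of the selected cells directly, which is why a single `comm.isend(MPI_BOTTOM, 1, multipole_type_to_send[p], p, 611)` ships all of them without an intermediate buffer. A minimal sketch of that general idiom in plain MPI (the block pointers and lengths are hypothetical, not the layout produced by scalfmm):

```cpp
#include <mpi.h>
#include <vector>

// Sketch: describe several non-contiguous double blocks with one derived
// datatype built from absolute addresses (hypothetical blocks and lengths).
MPI_Datatype make_absolute_multipole_type(std::vector<double*> const& blocks, int block_len)
{
    std::vector<MPI_Aint> displs(blocks.size());
    std::vector<int> lens(blocks.size(), block_len);
    for(std::size_t i = 0; i < blocks.size(); ++i)
    {
        MPI_Get_address(blocks[i], &displs[i]);   // absolute address of each multipole block
    }
    MPI_Datatype type{};
    MPI_Type_create_hindexed(int(blocks.size()), lens.data(), displs.data(), MPI_DOUBLE, &type);
    MPI_Type_commit(&type);
    return type;
}
// Usage: MPI_Isend(MPI_BOTTOM, 1, type, dest, tag, comm, &request);
// MPI_BOTTOM is valid here because the datatype already encodes absolute addresses.
```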
+    /// @brief  Receive the multipoles and put them in ghost groups
+    ///
+    /// @tparam TreeS
+    /// @tparam VectorOfVectorGroupAccessType
+    /// @param level  level in the tree
+    /// @param tree   the tree whose ghost cells receive the multipoles
+    /// @param cells_to_receive_access  cell access vector (pointer to the group, index inside it)
+    template<typename TreeS, typename VectorOfVectorGroupAccessType>
+    void task_receive_multipole_at_level(int const& level, TreeS& tree,
+                                         VectorOfVectorGroupAccessType const& cells_to_receive_access)
+    {
+        // We first construct the out dependencies (all the ghost groups (right and left))
+        //  naive version
+        //
+        auto& para = tree.get_parallel_manager();
+        auto comm = para.get_communicator();
+        const auto nb_proc = para.get_num_processes();
+        //
+        auto size_dep{std::distance(tree.begin_cells(level), tree.begin_mine_cells(level)) +
+                      std::distance(tree.end_mine_cells(level), tree.end_cells(level))};
+        using dependencies_type = typename TreeS::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
+        std::vector<dependencies_type> dependencies(size_dep);
+        int idx{0};
+        for(auto it_grp = tree.begin_cells(level); it_grp != tree.begin_mine_cells(level); ++it_grp, ++idx)
+        {
+            dependencies[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
+        }
+        for(auto it_grp = tree.end_mine_cells(level); it_grp != tree.end_cells(level); ++it_grp, ++idx)
+        {
+            dependencies[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
+        }
+        io::print("dependencies(recv)-old : ", dependencies);
+        //
+        // construct the MPI type to send the all multipoles
+        //
+        using multipole_type = typename TreeS::base_type::cell_type::storage_type::transfer_multipole_type;
+        auto mpi_multipole_value_type = cpp_tools::parallel_manager::mpi::get_datatype<multipole_type>();
+        static constexpr std::size_t inputs_size = TreeS::base_type::cell_type::inputs_size;
+        // auto multipole_type_to_receive = scalfmm::parallel::comm::build_mpi_multipoles_type(
+        //   cells_to_receive_access, inputs_size, mpi_multipole_value_type);
+        tree.get_receive_multipole_types(level) = scalfmm::parallel::comm::build_mpi_multipoles_type(
+          cells_to_receive_access, inputs_size, mpi_multipole_value_type);
+
+        // auto ptr_multipole_type_to_receive = tree.get_multipole_types(level).data();
+        //receive the multipoles
+        // tree.set_receive_access(to_receive);
+        static constexpr auto prio{omp::priorities::max};
+        auto ptr_tree = &tree;
+
+#pragma omp task firstprivate(nb_proc, ptr_tree)                                                                       \
+  depend(iterator(std::size_t it = 0 : dependencies.size()), inout : (dependencies[it])[0]) priority(prio)
+        {
+            std::cout << "M2L-old  task(transfer(receiv)) " << std::endl << std::flush;
+            io::print("M2L-old  transfer_comm(task) dependencies(in): ", dependencies);
+
+            std::vector<cpp_tools::parallel_manager::mpi::request> recept_mpi_status;
+            auto ptr_multipole_type_to_receive = ptr_tree->get_receive_multipole_types(level).data();
+            for(auto p = 0; p < nb_proc; ++p)
+            {
+                if(ptr_multipole_type_to_receive[p] != MPI_DATATYPE_NULL)
+                {
+                    recept_mpi_status.push_back(comm.irecv(MPI_BOTTOM, 1, ptr_multipole_type_to_receive[p], p, 611));
+                }
+            }
+            if(recept_mpi_status.size() > 0)
+            {
+                cpp_tools::parallel_manager::mpi::request::waitall(recept_mpi_status.size(), recept_mpi_status.data());
+                {
+                    std::cout << "M2L-old   --  level " << level << "   --  " << std::endl;
+                    scalfmm::component::for_each_mine_component(tree.begin_cells(level), tree.end_cells(level),
+                                                                [](auto const& cell)
+                                                                {
+                                                                    std::cout << "M2L task(end receive) cell index "
+                                                                              << cell.index() << "  multipoles "
+                                                                              << cell.transfer_multipoles().at(0)
+                                                                              << "  locals " << cell.locals().at(0)
+                                                                              << std::endl
+                                                                              << std::flush;
+                                                                });
+                }
+            }
+            std::clog << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&\n" << std::flush;
+
+        }   // end task
+    }
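Both tasks above attach their dependencies through the OpenMP 5.0 `iterator` modifier of the `depend` clause, which lets a single task depend on a runtime-sized set of multipole blocks. A reduced, self-contained sketch of that mechanism (the data and the pointer vector are hypothetical):

```cpp
#include <cstdio>
#include <vector>

int main()
{
    std::vector<int> data(4, 0);
    std::vector<int*> deps{&data[0], &data[1], &data[2], &data[3]};

#pragma omp parallel
#pragma omp single
    {
        for(std::size_t i = 0; i < deps.size(); ++i)
        {
            // one producer task per block
#pragma omp task firstprivate(i) depend(out : (deps[i])[0])
            data[i] = int(i);
        }
        // a single consumer task that depends on every produced block
#pragma omp task depend(iterator(std::size_t it = 0 : deps.size()), in : (deps[it])[0])
        {
            int sum = 0;
            for(int v: data)
            {
                sum += v;
            }
            std::printf("sum = %d\n", sum);   // prints 6 once all producers are done
        }
    }
    return 0;
}
```

This requires an OpenMP 5.0 compiler (e.g. a recent `g++ -fopenmp`).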
+
+    /// @brief Generates a task to send/receive the multipoles
+    ///
+    /// The received multipoles are put in the ghost groups.
+    /// @tparam TreeS
+    /// @tparam TreeT
+    /// @param level the level in the tree
+    /// @param tree  the tree containing the source particles
+    /// @param treeT the tree containing the target particles
+    template<typename TreeS, typename TreeT>
+    void inline task_communications(const int level, TreeS& tree, TreeT& treeT)
+    {
+        // std::cout << "task_communications  fct start\n ";
+        static constexpr auto prio{omp::priorities::max - 5};
+        static constexpr int nb_inputs = TreeS::cell_type::storage_type::inputs_size;
+        //
+        using multipole_type = typename TreeS::base_type::cell_type::storage_type::transfer_multipole_type;
+        using dependencies_type = typename TreeS::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
+        using grp_access_type = std::pair<decltype(tree.begin_cells(level)), int>;
+
+        //
+        auto& para = tree.get_parallel_manager();
+        const auto nb_proc = para.get_num_processes();
+        const auto rank = para.get_process_id();
+        if(nb_proc == 1)
+        {
+            return;
+        }
+        //
+        auto size_dep{std::distance(tree.begin_cells(level), tree.begin_mine_cells(level)) +
+                      std::distance(tree.end_mine_cells(level), tree.end_cells(level))};
+        std::vector<dependencies_type> dependencies_in, dependencies_out(size_dep);
+
+        // Build the dependencies in and out
+        {
+            /// We first construct the in dependencies to ensure that multipoles
+            /// are updated by the  previous pass.
+            auto const& morton_to_send = tree.send_morton_indexes(level);
+            // io::print("m2l(task comm) morton_to_sendl=" + std::to_string(level) + "): ", morton_to_send);
+
+            for(auto p = 0; p < nb_proc; ++p)
+            {
+                if(morton_to_send[p].size() > 0)
+                {
+                    parallel::utils::build_multipoles_dependencies_from_morton_vector(
+                      tree.begin_mine_cells(level), tree.end_mine_cells(level), morton_to_send[p], dependencies_in);
+                }
+            }
+            std::sort(std::begin(dependencies_in), std::end(dependencies_in));
+            auto last = std::unique(std::begin(dependencies_in), std::end(dependencies_in));
+            dependencies_in.erase(last, dependencies_in.end());
+            //
+            // out dependencies (on all groups of ghosts)
+            //
+            int idx{0};
+            for(auto it_grp = tree.begin_cells(level); it_grp != tree.begin_mine_cells(level); ++it_grp, ++idx)
+            {
+                dependencies_out[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
+            }
+            for(auto it_grp = tree.end_mine_cells(level); it_grp != tree.end_cells(level); ++it_grp, ++idx)
+            {
+                dependencies_out[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
+            }
+            // io::print("m2l(task comm) dependencies(transfer)(out): ", dependencies_out);
+        }
+        // std::cout << " insert task (comm M2L(level=" + std::to_string(level) + ") \n";
+        // io::print("      out:   ", dependencies_out);
+        // io::print("      in:    ", dependencies_in);
+
+        auto ptr_tree = &tree;
+        auto ptr_treeT = &treeT;
+        // std::cout << "      inout: " << ptr_tree << std::endl;   //to serialise the communication tasks
+
+        // clang-format off
+	#pragma omp task untied default(none) firstprivate(ptr_tree, nb_proc, rank, nb_inputs, level)  shared(std::cerr, std::clog) \
+        depend(iterator(std::size_t it = 0 : dependencies_in.size()), in : (dependencies_in[it])[0])      \
+        depend(iterator(std::size_t it = 0 : dependencies_out.size()), out : (dependencies_out[it])[0]) \
+        depend(inout: ptr_tree[0], ptr_treeT[0] ) priority(prio)
+        // clang-format on
+        {
+            int receive_section{0}, send_section{0};
+            // std::clog << " m2l task(comm(l=" << level << ")) Send part \n" << std::flush;
+            // #ifndef NO_COMM_TASK
+            try
+            {
+                const int tag_level = 611 + level;
+                send_section = 1;
+                auto mpi_multipole_value_type = cpp_tools::parallel_manager::mpi::get_datatype<multipole_type>();
+                auto comm = ptr_tree->get_parallel_manager().get_communicator();
+                // send part
+
+                auto& morton_to_send = ptr_tree->send_morton_indexes(level);
+                // construct cells access (send)
+                //                std::vector<std::vector<grp_access_type>> cells_to_send_access(nb_proc);
+                auto& cells_to_send_access = ptr_tree->send_cells_access(level);
+                auto begin_grp = ptr_tree->begin_mine_cells(level);
+                auto end_grp = ptr_tree->end_mine_cells(level);
+                // std::clog << " m2l task(comm(l=" << level << ")) build direct acces \n" << std::flush;
+                scalfmm::parallel::comm::build_direct_access_to_components(nb_proc, begin_grp, end_grp,
+                                                                           cells_to_send_access, morton_to_send);
+                // build type to send all the multipoles
+                // std::clog << " m2l task(comm(l=" << level << ")) build type \n" << std::flush;
+                auto multipole_type_to_send = scalfmm::parallel::comm::build_mpi_multipoles_type(
+                  cells_to_send_access, nb_inputs, mpi_multipole_value_type);
+                //
+                for(auto p = 0; p < nb_proc; ++p)
+                {
+                    // io::print("morton_to_send", morton_to_send[p]);
+                    if(morton_to_send[p].size() > 0)
+                    {
+                        // std::clog << "m2l task(send(l=" << level << ")) send to " << p << " tag " << tag_level
+                        //           << std::endl
+                        //           << std::flush;
+                        // std::cout << "m2l task(send) ptr_data_type(" << p << ") "   //<< &(multipole_type_to_send[p])
+                        //           << " level: " << level << std::endl
+                        //           << std::flush;
+                        // io::print(std::cout, "morton_to_send(" + std::to_string(p) + ")", std::begin(morton_to_send[p]),
+                        //           std::end(morton_to_send[p]));
+                        // std::cout << std::flush << std::endl;
+                        // #ifdef COMM_SEND
+                        comm.isend(MPI_BOTTOM, 1, multipole_type_to_send[p], p, tag_level);
+                        // #endif
+                    }
+                }
+                // std::clog << "  m2l task(comm(l=" << level << ")) send end \n";
+                ///  end send part
+                ///////////////////////////////////////////////////////
+                receive_section = 1;
+                {
+                    // std::clog << " m2l task(comm(l=" << level << ")) Receive part  level " << level << std::endl;
+                    // receive part
+                    auto& cells_to_receive_access = ptr_tree->receive_cells_access(level);
+                    // for(auto p = 0; p < nb_proc; ++p)
+                    // {
+                    //     auto& access = cells_to_receive_access.at(p);
+                    //     if(access.size())
+                    //     {
+                    //         std::cout << "  cells_to_receive_access " << p << "  size " << access.size() << std::endl
+                    //                   << std::flush;
+                    //         for(auto i = 0; i < access.size(); ++i)
+                    //         {
+                    //             std::cout << i << "  ptr  " << access[i].first->get() << "  index " << access[i].second
+                    //                       << "  morton "
+                    //                       << (*(access[i].first))->component(access[i].second).csymbolics().morton_index
+                    //                       << std::endl;
+                    //         }
+                    //     }
+                    // }
+                    //
+                    auto type = scalfmm::parallel::comm::build_mpi_multipoles_type(cells_to_receive_access, nb_inputs,
+                                                                                   mpi_multipole_value_type);
+
+                    auto ptr_multipole_type_to_receive = ptr_tree->get_receive_multipole_types(level);
+
+                    // post receive
+                    std::vector<cpp_tools::parallel_manager::mpi::request> recept_mpi_status;
+                    for(auto p = 0; p < nb_proc; ++p)
+                    {
+                        if(cells_to_receive_access.at(p).size() != 0)
+                        // if(ptr_multipole_type_to_receive[p] != MPI_DATATYPE_NULL)
+                        {
+                            // std::clog << "m2l task(comm(l=" << level << ")) post ireceive from " << p << " tag "
+                            //           << tag_level << std::endl
+                            //           << std::flush;
+                            recept_mpi_status.push_back(comm.irecv(MPI_BOTTOM, 1, type[p], p, tag_level));
+                        }
+                    }
+
+                    if(recept_mpi_status.size() > 0)
+                    {
+#ifndef SCALFMM_USE_WAIT_ANY
+                        int cpt{0};
+                        while(cpt != recept_mpi_status.size())
+                        {
+                            int count{1};
+                            int index{-1};
+                            MPI_Status status;
+                            MPI_Waitany(int(recept_mpi_status.size()),
+                                        reinterpret_cast<MPI_Request*>(recept_mpi_status.data()), &index, &status);
+                            // index =  cpp_tools::parallel_manager::mpi::request::waitany(recept_mpi_status.size()), recept_mpi_status.data(),status);
+                            ++cpt;
+                            // std::clog << "receive one comm from " << status.MPI_SOURCE << " tag " << status.MPI_TAG
+                            //           << " wait for " << recept_mpi_status.size() - cpt << std::endl;
+
+                            if(status.MPI_TAG != tag_level)
+                            {
+                                std::cerr << "   wrong tag  wait for " << tag_level << " received " << status.MPI_TAG
+                                          << std::endl;
+                                --cpt;
+                            }
+                        }
+#else
+                        // std::clog << " m2l task(comm(l=" << level << ")) wait all level " << level << "  \n"
+                        //           << std::flush;
+                        cpp_tools::parallel_manager::mpi::request::waitall(recept_mpi_status.size(),
+                                                                           recept_mpi_status.data());
+
+#endif
+                        // std::cout << " m2l task(comm) wait all level " << level << "  \n" << std::flush;
+                        // std::clog << " m2l task(comm) wait all level " << level << "  \n" << std::flush;
+
+                        // {
+                        //     std::cout << "M2L  --  level " << level << "   --  " << std::endl;
+                        //     scalfmm::component::for_each_mine_component(
+                        //       ptr_tree->begin_cells(level), ptr_tree->end_cells(level),
+                        //       [](auto const& cell)
+                        //       {
+                        //           std::cout << "M2L task(end receive) cell index " << cell.index() << "  multipoles "
+                        //                     << cell.transfer_multipoles().at(0) << "  locals " << cell.locals().at(0)
+                        //                     << std::endl
+                        //                     << std::flush;
+                        //       });
+                        // }
+                    }
+                    // std::clog << "m2l task(comm(l=" << level << ")) end receive \n";
+                }
+            }
+            catch(std::exception& e)
+            {
+                std::cerr << " error in task_communication !!!!!!!!!\n";
+                std::cerr << " m2l task(comm) crash all level " << level << "  \n" << std::flush;
+                if(receive_section > 0)
+                {
+                    std::cerr << "Bug in receive section \n" << std::flush;
+                }
+                else
+                {
+                    std::cerr << "Bug in send section \n" << std::flush;
+                }
+                std::cerr << e.what() << '\n' << std::flush;
+                std::exit(EXIT_FAILURE);
+            }
+            // #endif
+            // std::clog << "m2l task(comm(l=" << level << ")) end\n";
+        }   // end task
+        // std::cout << "task_communications  fct end\n ";
+    }
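Beyond the per-group `in`/`out` dependencies, the task above also carries `depend(inout : ptr_tree[0], ptr_treeT[0])`. Every communication task, at every level, names the same two addresses, so these act as sentinel dependencies that serialize the communication tasks among themselves (the intent noted in the commented line "to serialise the communication tasks") while still letting them overlap with unrelated compute tasks. A reduced sketch of that idiom, with a hypothetical pipeline meant to run inside a `parallel`/`single` region:

```cpp
#include <cstdio>

// Sketch of a sentinel inout dependency: all "comm" tasks share one address,
// so they execute one after the other, in creation order; "kernel" tasks do not.
struct tree_t { int dummy; };

void pipeline(tree_t& tree, int nb_levels)   // assumed to be called from a parallel/single region
{
    tree_t* ptr_tree = &tree;   // sentinel address shared by every communication task
    for(int level = 0; level < nb_levels; ++level)
    {
#pragma omp task firstprivate(level) depend(inout : ptr_tree[0])
        std::printf("comm at level %d\n", level);

#pragma omp task firstprivate(level)
        std::printf("kernels at level %d\n", level);
    }
#pragma omp taskwait
}
```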
 
     /**
      * @brief Perform the communications between the tree_source and the tree_target for the current level
@@ -36,7 +626,6 @@ namespace scalfmm::algorithms::mpi::pass
      *
      * @tparam TreeS
      * @tparam TreeT
-     *
      * @param level  level in the tree
      * @param tree_source  source tree (contains the multipoles)
      * @param tree_target  target tree
@@ -44,16 +633,12 @@ namespace scalfmm::algorithms::mpi::pass
     template<typename TreeS, typename TreeT>
     inline auto start_communications(const int& level, TreeS& tree_source, TreeT& tree_target) -> void
     {
-        using mortonIdx_type = std::int64_t;   // typename TreeT::group_of_cell_type::symbolics_type::morton_type;
-        static constexpr int nb_inputs = TreeS::base_type::cell_type::storage_type::inputs_size;
-        static constexpr int dimension = TreeS::base_type::box_type::dimension;
+        // std::cout << "start_communications  fct start at level " << level << "\n ";
 
-        using value_type_ori = typename TreeS::base_type::cell_type::storage_type::transfer_multipole_type;
-        using value_type1 = value_type_ori;
-        using value_type = std::conditional_t<meta::is_complex_v<value_type_ori>,
-                                              meta::has_value_type_t<value_type_ori>, value_type_ori>;
+        using mortonIdx_type = std::int64_t;   // typename TreeT::group_of_cell_type::symbolics_type::morton_type;
+        // static constexpr int nb_inputs = TreeS::base_type::cell_type::storage_type::inputs_size;
+        // static constexpr int dimension = TreeS::base_type::box_type::dimension;
 
-        int nb_values = meta::is_complex_v<value_type_ori> ? 2 : 1;
         auto& para = tree_target.get_parallel_manager();
         auto comm = para.get_communicator();
 
@@ -69,7 +654,7 @@ namespace scalfmm::algorithms::mpi::pass
         ///////////////////////////////////////////////////////////////////////////////////
         /// STEP 1
         ///////////////////////////////////////////////////////////////////////////////////
-        /// Determines the Morton index vector to be received from processor p? In addition, for each Morton index we
+        /// Determines the Morton index vector to be received from processor p. In addition, for each Morton index we
         /// store the cell, i.e. a pointer to its group and the index within the group (group_ptr, index). This will
         /// enable us to insert the multipoles received from processor p directly into the cell.
         ///
@@ -84,28 +669,52 @@ namespace scalfmm::algorithms::mpi::pass
         using grp_access_type = std::pair<decltype(tree_target.begin_cells(level)), int>;
         std::vector<std::vector<grp_access_type>> to_receive(nb_proc);
         std::vector<std::vector<mortonIdx_type>> morton_to_receive(nb_proc);   // TOREMOVE
-                                                                               // #ifdef SPLIT_COMM
+        bool verbose = false;
+
         {
-            auto begin_left_ghost = tree_target.begin_cells(level);
+            auto begin_left_ghost = tree_source.begin_cells(level);
 
-            auto end_left_ghost = tree_target.begin_mine_cells(level);
-            auto begin_right_ghost = tree_target.end_mine_cells(level);
-            auto end_right_ghost = tree_target.end_cells(level);
+            auto end_left_ghost = tree_source.begin_mine_cells(level);
+            auto begin_right_ghost = tree_source.end_mine_cells(level);
+            auto end_right_ghost = tree_source.end_cells(level);
             auto const& distrib = tree_source.get_cell_distribution(level);
             //
+            // std::clog << " step 1 M2L" << std::endl << std::flush;
+
             scalfmm::parallel::comm::start_step1(comm, begin_left_ghost, end_left_ghost, begin_right_ghost,
                                                  end_right_ghost, distrib, nb_messages_to_receive, nb_messages_to_send,
-                                                 to_receive, morton_to_receive);
+                                                 to_receive, morton_to_receive, verbose);
+            // std::cout << " Value after step 1 M2L" << std::endl << std::flush;
+            // for(auto p = 0; p < nb_proc; ++p)
+            // {
+            //     auto& access = to_receive[p];
+
+            //     std::cout << "  to_receive " << p << "  size " << access.size() << std::endl << std::flush;
+            //     for(auto i = 0; i < access.size(); ++i)
+            //     {
+            //         std::cout << i << "  ptr  " << access[i].first->get() << "  index " << access[i].second
+            //                   << "  morton "
+            //                   << (*(access[i].first))->component(access[i].second).csymbolics().morton_index
+            //                   << std::endl;
+            //     }
+            // }
+            tree_source.receive_cells_access(level) = to_receive;
         }
+        // for(auto p = 0; p < nb_proc; ++p)
+        // {
+        //     io::print("   after_step1 morton to receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
+        // }
+        // io::print("    after_step1 nb_messages_to_receive ", nb_messages_to_receive);
+        // io::print("    after_step1 nb_messages_to_send ", nb_messages_to_send);
 
         ///////////////////////////////////////////////////////////////////////////////////
         /// STEP 2
         ///
         // We can now exchange the morton indices
         // Morton's list of indices to send their multipole to proc p
-        std::vector<std::vector<mortonIdx_type>> morton_to_send(nb_proc);
-        std::vector<cpp_tools::parallel_manager::mpi::request> tab_mpi_status;
-
+        // std::vector<std::vector<mortonIdx_type>> morton_to_send(nb_proc);
+        // std::clog << " step 2 M2L" << std::endl << std::flush;
+        auto& morton_to_send = tree_source.send_morton_indexes(level);
         scalfmm::parallel::comm::start_step2(nb_proc, rank, comm, nb_messages_to_receive, nb_messages_to_send,
                                              morton_to_receive, morton_to_send);
 
@@ -126,207 +735,137 @@ namespace scalfmm::algorithms::mpi::pass
         /// multipoles we're going to put in our ghost cells.
         /////////////////////////////////////////////////////////////////////////////////
         // type of dependence
-        using dep_type = typename TreeS::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
-
-        auto mpi_multipole_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
-        //        std::cout << "\n Start step 3\n\n";
-        auto nb_cells{morton_to_send[0].size()};
-        for(auto const& p: morton_to_send)
         {
-            nb_cells = std::max(nb_cells, p.size());
-        }
-        int order = tree_source.order();
-        //         nb_values = 2 if complex type otherwise 1;
-        //   math::pow(order, dimension) only works with interpolation not generic !!!!
-        int size_mult{int(nb_inputs * math::pow(order, dimension)) * nb_values};   // WRONG !!!!!!
+#ifndef __SPLIT_SEND_RECEIV__
+            task_communications(level, tree_source, tree_target);   // works
+#else
+#warning( "segfault in this section ")
 
-        // allocate the buffer to store the multipoles
-        std::vector<std::vector<value_type_ori>> buffer(nb_proc);
-        {
-            // method to construct the buffer of multipoles to send
-            auto build_buffer = [](auto first_group_ghost, auto last_group_ghost,
-                                   std::vector<mortonIdx_type> const& index_to_send,
-                                   std::vector<value_type_ori>& buffer)
-            {
-                try
-                {
-                    int idx{0};
-                    int max_idx = index_to_send.size();   // loop on the groups
-                    auto it = std::begin(buffer);
-                    for(auto grp_ptr = first_group_ghost; grp_ptr != last_group_ghost; ++grp_ptr)
-                    {
-                        int start_grp{0};
-
-                        auto const& csymb = (*grp_ptr)->csymbolics();
-                        // iterate on the cells
-                        while(idx < max_idx and
-                              math::between(index_to_send[idx], csymb.starting_index, csymb.ending_index))
-                        {   // find cell inside the group
-                            int pos{-1};
-                            for(int i = start_grp; i < (*grp_ptr)->size(); ++i)
-                            {
-                                auto morton = (*grp_ptr)->component(i).csymbolics().morton_index;
-                                if(index_to_send[idx] == morton)
-                                {
-                                    pos = i;
-                                    start_grp = i + 1;
-                                    // std::cout << "   pos = " << pos << std::endl;
-                                    break;
-                                }
-                            }
-                            auto const& cell = (*grp_ptr)->component(pos);
-                            auto const& m = cell.transfer_multipoles();
-                            auto nb_m = m.size();
-                            for(std::size_t i{0}; i < nb_m; ++i)
-                            {
-                                auto const& ten = m.at(i);
-                                std::copy(std::begin(ten), std::end(ten), it);
-                                it += ten.size();
-                            }
-                            ++idx;
-                        }
-                    }
-                }
-                catch(std::out_of_range& e)
-                {
-                    std::cout << " error in buffer building !!!!!!!!!\n";
-                    std::cout << e.what() << '\n' << std::flush;
-                }
-            };
+            std::clog << " step 3 M2L task_send_multipole_at_level" << std::endl << std::flush;
 
-            // vector of dependencies on the group for each MPI process
-            std::vector<std::vector<dep_type>> deps_send(nb_proc);
+            // Construct a task to send the multipole
+            {
+                // loop on the processors to construct the buffer and to send it
+                task_send_multipole_at_level(level, tree_source, morton_to_send);
+            }
 
-            // loop on the processors to construct the buffer and to send it
-            for(auto p = 0; p < nb_proc; ++p)
+            // Construct a task to receive the multipole
             {
-                /// check if I have something to send
-                if(p != rank and morton_to_send[p].size() > 0)
-                {
-#ifdef TRANSFERT_COMM_TASKS
-                    /// We first construct the in dependencies to ensure that multipoles
-                    /// are updated by the  previous pass.
-                    parallel::utils::build_dependencies_from_morton_vector(tree_source.begin_mine_cells(level),
-                                                                           tree_source.end_mine_cells(level),
-                                                                           morton_to_send[p], deps_send[p]);
-                    /// spawn a task for sending communication to process p
-
-#pragma omp task shared(tree_source, morton_to_send, buffer, rank, nb_proc, deps_send, nb_messages_to_receive,         \
-                          size_mult, mpi_multipole_type) firstprivate(p, level)                                        \
-  depend(iterator(std::size_t it = 0 : deps_send[p].size()), in : ((deps_send[p])[it])[0]) priority(prio)
-#endif
-                    {
-                        buffer[p].resize(morton_to_send[p].size() * size_mult);
-                        build_buffer(tree_source.begin_mine_cells(level), tree_source.end_mine_cells(level),
-                                     morton_to_send[p], buffer[p]);
+                std::cout << "   task_receive_multipole_at_level start  " << level << std::endl << std::flush;
 
-                        // send buffer to processor p
-                        comm.isend(reinterpret_cast<value_type*>(buffer[p].data()), buffer[p].size(),
-                                   mpi_multipole_type, p, 611);
-                    }
-                }
-            }
-        }
-        // Reception of the multipoles
-        {
-            // Reset the array of requests used in step 2
-            tab_mpi_status.clear();
-            // We first construct the out dependencies (all the ghost groups)
-#ifdef TRANSFERT_COMM_TASKS
-
-            // compute the dependencies
-            auto size_dep{std::distance(tree_source.begin_cells(level), tree_source.begin_mine_cells(level)) +
-                          std::distance(tree_source.end_mine_cells(level), tree_source.end_cells(level))};
-            std::vector<dep_type> deps_recv(size_dep);
-            {   // Find all dependencies - naive version
-                int idx{0};
-                for(auto it_grp = tree_source.begin_cells(level); it_grp != tree_source.begin_mine_cells(level);
-                    ++it_grp, ++idx)
-                {
-                    deps_recv[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
-                }
-                for(auto it_grp = tree_source.end_mine_cells(level); it_grp != tree_source.end_cells(level);
-                    ++it_grp, ++idx)
-                {
-                    deps_recv[idx] = &(it_grp->get()->ccomponent(0).cmultipoles(0));
-                }
+                task_receive_multipole_at_level(level, tree_source, to_receive);
             }
-
-// post the task on the reception
-#pragma omp task shared(rank, nb_proc, nb_messages_to_receive, size_mult, mpi_multipole_type)                          \
-  depend(iterator(std::size_t it = 0 : deps_recv.size()), out : (deps_recv[it])[0]) priority(prio)
 #endif
-            {
-                // post the receives
-                int cc{0};
-                std::vector<cpp_tools::parallel_manager::mpi::request> recept_mpi_status;
-                std::vector<std::vector<value_type_ori>> buffer_rep(nb_proc);
+        }   // end step3
+        // std::cout << "start_communications  fct end at level " << level << "\n ";
 
-                for(auto p = 0; p < nb_proc; ++p)
-                {
-                    if(p != rank and nb_messages_to_receive[p] != 0)
-                    {
-                        buffer_rep[p].resize(nb_messages_to_receive[p] * size_mult);
+    }   // end function start_communications
 
-                        recept_mpi_status.push_back(
-                          comm.irecv(buffer_rep[p].data(), buffer_rep[p].size(), mpi_multipole_type, p, 611));
-                        ++cc;
-                    }
-                }
+    template<typename TreeS, typename TreeT>
+    inline auto prepare_comm_transfert(const int& level, TreeS& tree_source, TreeT& tree_target) -> void
+    {
+        // std::cout << "start_communications  fct start at level " << level << "\n ";
 
-                // wait we receive all the communication
+        using mortonIdx_type =
+          typename TreeS::morton_type;   // typename TreeT::group_of_cell_type::symbolics_type::morton_type;
+        // static constexpr int nb_inputs = TreeS::base_type::cell_type::storage_type::inputs_size;
+        // static constexpr int dimension = TreeS::base_type::box_type::dimension;
 
-                if(recept_mpi_status.size() > 0)
-                {
-                    cpp_tools::parallel_manager::mpi::request::waitall(recept_mpi_status.size(),
-                                                                       recept_mpi_status.data());
-                }
+        auto& para = tree_source.get_parallel_manager();
+        auto comm = para.get_communicator();
 
-                // put the multipoles inside the ghosts
-                for(auto p = 0; p < nb_proc; ++p)
-                {
-                    if(p != rank and to_receive[p].size() > 0)
-                    {
-                        auto const& buffer = buffer_rep[p];
-                        // ONLY WORKS IF SOURCE == TARGET
-                        auto const& pairs = to_receive[p];
-                        auto it = std::begin(buffer);
+        auto rank = para.get_process_id();
+        auto nb_proc = para.get_num_processes();
+        if(nb_proc == 1)
+        {   // Openmp case -> no communication
+            return;
+        }
+        std::vector<int> nb_messages_to_send(nb_proc, 0);
+        std::vector<int> nb_messages_to_receive(nb_proc, 0);
 
-                        for(auto i = 0; i < int(pairs.size()); ++i)
-                        {
-                            auto& cell = pairs[i].first->get()->component(pairs[i].second);
-                            auto& m = cell.transfer_multipoles();
-                            auto nb_m = m.size();
+        ///////////////////////////////////////////////////////////////////////////////////
+        /// STEP 1
+        ///////////////////////////////////////////////////////////////////////////////////
+        /// Determines the Morton index vector to be received from processor p. In addition, for each Morton index we
+        /// store the cell, i.e. a pointer to its group and the index within the group (group_ptr, index). This will
+        /// enable us to insert the multipoles received from processor p directly into the cell.
+        ///
+        /// to_receive: a vector of vector of pair (the iterator on the group ,the position of the cell in the group)
+        ///  to_receive[p] is the position vector of cells in groups whose Morton index comes from processor p
+        ///  to_receive[p][i] a pair (the iterator on the group ,the position of the cell in the group)
+        /// vector of size nb_proc
+        ///    - nb_messages_to_receive: the number of morton indices to exchange with processor p
+        ///    - nb_messages_to_send: the number of morton indices to send to processor p
+        ///    - morton_to_recv: the morton indices to exchange with processor p
+        /////////////////////////////////////////////////////////////////////////////////
+        using grp_access_type = std::pair<decltype(tree_target.begin_cells(level)), int>;
+        std::vector<std::vector<grp_access_type>> to_receive(nb_proc);
+        std::vector<std::vector<mortonIdx_type>> morton_to_receive(nb_proc);   // TOREMOVE
+        bool verbose = false;
+
+        {
+            auto begin_left_ghost = tree_source.begin_cells(level);
+
+            auto end_left_ghost = tree_source.begin_mine_cells(level);
+            auto begin_right_ghost = tree_source.end_mine_cells(level);
+            auto end_right_ghost = tree_source.end_cells(level);
+            auto const& distrib = tree_source.get_cell_distribution(level);
+            //
+            // std::clog << " step 1 M2L" << std::endl << std::flush;
+
+            scalfmm::parallel::comm::start_step1(comm, begin_left_ghost, end_left_ghost, begin_right_ghost,
+                                                 end_right_ghost, distrib, nb_messages_to_receive, nb_messages_to_send,
+                                                 to_receive, morton_to_receive, verbose);
+            // std::cout << " Value after step 1 M2L" << std::endl << std::flush;
+            // for(auto p = 0; p < nb_proc; ++p)
+            // {
+            //     auto& access = to_receive[p];
+
+            //     std::cout << "  to_receive " << p << "  size " << access.size() << std::endl << std::flush;
+            //     for(auto i = 0; i < access.size(); ++i)
+            //     {
+            //         std::cout << i << "  ptr  " << access[i].first->get() << "  index " << access[i].second
+            //                   << "  morton "
+            //                   << (*(access[i].first))->component(access[i].second).csymbolics().morton_index
+            //                   << std::endl;
+            //     }
+            // }
+            tree_source.receive_cells_access(level) = to_receive;
+        }
+        // for(auto p = 0; p < nb_proc; ++p)
+        // {
+        //     io::print("   after_step1 morton to receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
+        // }
+        // io::print("    after_step1 nb_messages_to_receive ", nb_messages_to_receive);
+        // io::print("    after_step1 nb_messages_to_send ", nb_messages_to_send);
+
+        ///////////////////////////////////////////////////////////////////////////////////
+        /// STEP 2
+        ///
+        // We can now exchange the morton indices
+        // Morton's list of indices to send their multipole to proc p
+        // std::vector<std::vector<mortonIdx_type>> morton_to_send(nb_proc);
+        // std::clog << " step 2 M2L" << std::endl << std::flush;
+        auto& morton_to_send = tree_source.send_morton_indexes(level);
+        scalfmm::parallel::comm::start_step2(nb_proc, rank, comm, nb_messages_to_receive, nb_messages_to_send,
+                                             morton_to_receive, morton_to_send);
+    }
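`start_step1`/`start_step2` realize the request/answer exchange described in the STEP 1 and STEP 2 comments: each rank first works out which Morton indices it wants from every other rank, then learns in return which indices it must send. Purely as an illustration (a generic sketch, not scalfmm's implementation of those helpers), the same handshake can be written with an `MPI_Alltoall` for the counts followed by point-to-point messages for the index lists; it assumes an MPI-3 library for the const send buffer:

```cpp
#include <cstdint>
#include <vector>

#include <mpi.h>

// Generic sketch: each rank knows which Morton indices it wants from rank p
// (want[p]) and learns which indices it must send to p (returned in to_send[p]).
std::vector<std::vector<std::int64_t>>
exchange_requests(MPI_Comm comm, std::vector<std::vector<std::int64_t>> const& want)
{
    int rank{}, nb_proc{};
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &nb_proc);

    // step 1: exchange the number of requested indices with every rank
    std::vector<int> nb_want(nb_proc, 0), nb_asked(nb_proc, 0);
    for(int p = 0; p < nb_proc; ++p)
    {
        nb_want[p] = int(want[p].size());
    }
    MPI_Alltoall(nb_want.data(), 1, MPI_INT, nb_asked.data(), 1, MPI_INT, comm);

    // step 2: exchange the index lists themselves
    std::vector<std::vector<std::int64_t>> to_send(nb_proc);
    std::vector<MPI_Request> requests;
    for(int p = 0; p < nb_proc; ++p)
    {
        if(p != rank && nb_asked[p] > 0)
        {
            to_send[p].resize(nb_asked[p]);
            requests.emplace_back();
            MPI_Irecv(to_send[p].data(), nb_asked[p], MPI_INT64_T, p, 0, comm, &requests.back());
        }
        if(p != rank && nb_want[p] > 0)
        {
            requests.emplace_back();
            MPI_Isend(want[p].data(), nb_want[p], MPI_INT64_T, p, 0, comm, &requests.back());
        }
    }
    MPI_Waitall(int(requests.size()), requests.data(), MPI_STATUSES_IGNORE);
    return to_send;
}
```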
 
-                            for(std::size_t i{0}; i < nb_m; ++i)
-                            {
-                                auto& ten = m.at(i);
-                                std::copy(it, it + ten.size(), std::begin(ten));
-                                it += ten.size();
-                            }
-                        }
-                    }
-                }
-            }   // end task
-        }   // end step3
-        para.get_communicator().barrier();
-    }   // end function start_communications
     ///////////////////////////////////////////////////////////////////////////////////
 
-    /**
-    * @brief apply the transfer operator to construct the local approximation in tree_target
-    *
-    * @tparam TreeS template for the Tree source type
-    * @tparam TreeT template for the Tree target type
-    * @tparam FarField template for the far field type
-    * @tparam BufferPtr template for the type of pointer of the buffer
-    *
-    * @param tree_source the tree containing the source cells/leaves
-    * @param tree_target the tree containing the target cells/leaves
-    * @param far_field The far field operator
-    * @param buffers vector of buffers used by the far_field in the transfer pass (if needed)
-    * @param split the enum  (@see split_m2l) tp specify on which level we apply the transfer operator
-    */
+    /// @brief apply the transfer operator to construct the local approximation in tree_target
+    ///
+    /// @tparam TreeS template for the Tree source type
+    /// @tparam TreeT template for the Tree target type
+    /// @tparam FarField template for the far field type
+    /// @tparam BufferPtr template for the type of pointer of the buffer
+    /// @param tree_source the tree containing the source cells/leaves
+    /// @param tree_target the tree containing the target cells/leaves
+    /// @param far_field The far field operator
+    /// @param buffers vector of buffers used by the far_field in the transfer pass (if needed)
+    /// @param split the enum (@see split_m2l) to specify on which level we apply the transfer
+    /// operator
+    ///
     template<typename TreeS, typename TreeT, typename FarField, typename BufferPtr>
     inline auto transfer(TreeS& tree_source, TreeT& tree_target, FarField const& far_field,
                          std::vector<BufferPtr> const& buffers,
@@ -352,11 +891,23 @@ namespace scalfmm::algorithms::mpi::pass
         case omp::pass::split_m2l::full:
             break;
         }
+        // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
+
+        // std::cout << "  loop start at top_height " << top_height << " and end at last_level " << last_level << std::endl
+        //           << std::flush;
         for(std::size_t level = top_height; level < last_level; ++level)
         {
+            // std::cout << "transfer  : " << level << std::endl << std::flush;
             start_communications(level, tree_source, tree_target);
+            // std::cout << "   end comm  " << level << std::endl << std::flush;
+            // #pragma omp taskwait
+            // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
+
             omp::pass::transfer_level(level, tree_source, tree_target, far_field, buffers);
+            //             std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
         }
+        // std::cout << "end transfer pass" << std::endl << std::flush;
+        // std::cout << "&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&" << std::endl;
     }
 
 }   // namespace scalfmm::algorithms::mpi::pass
diff --git a/include/scalfmm/algorithms/mpi/upward.hpp b/include/scalfmm/algorithms/mpi/upward.hpp
index 463e5814c85a2db8e0faba004aae68422bd08a17..53d435ce963d15b9c456ef89f1678cb2f0e1c16e 100644
--- a/include/scalfmm/algorithms/mpi/upward.hpp
+++ b/include/scalfmm/algorithms/mpi/upward.hpp
@@ -1,26 +1,65 @@
 // --------------------------------
 // See LICENCE file at project root
-// File : scalfmm/algorithms/mpi/upward.hpp
+// File : scalfmm/algorithms/mpi/upward.hpp
 // --------------------------------
 #ifndef SCALFMM_ALGORITHMS_MPI_UPWARD_HPP
 #define SCALFMM_ALGORITHMS_MPI_UPWARD_HPP
 
 #ifdef _OPENMP
 
+#include <omp.h>
+
 #include "scalfmm/algorithms/omp/upward.hpp"
+// #include "scalfmm/operators/m2m.hpp"
+// #include "scalfmm/operators/tags.hpp"
+// #include "scalfmm/tree/utils.hpp"
+// #include "scalfmm/utils/massert.hpp"
+// #include "scalfmm/utils/math.hpp"
 
 #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 
-#include <omp.h>
-
 namespace scalfmm::algorithms::mpi::pass
 {
+    /// @brief Construct the vector of dependencies (child group)
+    /// @tparam IteratorType
+    /// @tparam MortonType
+    /// @tparam Dependencies_t
+    /// @tparam dimension
+    /// @param begin first iterator on the groups of cells (child)
+    /// @param end  last iterator on the groups of cells (child)
+    /// @param parent_morton_index the parent index
+    /// @param dependencies  the vector of dependencies
+    template<int dimension, typename IteratorType, typename MortonType, typename Dependencies_t>
+    void build_dependencies(IteratorType begin, IteratorType end, MortonType const& parent_morton_index,
+                            Dependencies_t& dependencies)
+    {
+        for(auto grp_ptr = begin; grp_ptr != end; ++grp_ptr)
+        {
+            auto const& csymb = (*grp_ptr)->csymbolics();
+            // iterate on the cells in the same group
+            // we move forward in the index vector
+            if(parent_morton_index == (csymb.starting_index >> dimension))
+            {
+                // std::cout << " add depend for grp with Int [" << csymb.starting_index << ", " << csymb.ending_index
+                //           << "]" << std::endl;
+                dependencies.push_back(&(grp_ptr->get()->ccomponent(0).cmultipoles(0)));
+            }
+            else
+            {
+                break;
+            }
+        }
+    }
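`build_dependencies` and the send/receive tests below rely on the usual 2^dimension-tree relation between Morton indices: dropping the last `dimension` bits of a child index (`index >> dimension`) yields the parent index, and the children of a parent `p` are exactly the indices from `p << dimension` to `((p + 1) << dimension) - 1`. A tiny standalone check of that relation (the numeric values are arbitrary):

```cpp
#include <cassert>
#include <cstdint>

int main()
{
    constexpr int dimension = 3;              // octree: 8 children per cell
    constexpr std::int64_t parent = 42;
    // the children of `parent` are 42 * 8 = 336 ... 343
    for(std::int64_t child = parent << dimension; child < ((parent + 1) << dimension); ++child)
    {
        assert((child >> dimension) == parent);   // the same test used in build_dependencies
    }
    return 0;
}
```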
     /**
-     * @brief Perform the communications for the children level
+     * @brief Perform the communications for the children level 
      *
+     *  Check if the parent of my first Morton index at the child level is owned by the
+     * previous process; if yes, we send the multipoles.
+     * 
      * @tparam Tree
-     *
      * @tparam Approximation
+     * @param[in] level the level at which to build the multipoles
+     * @param[in,out] tree the tree whose multipoles are updated
      */
     template<typename Tree>
     inline auto start_communications(const int& level, Tree& tree) -> void
@@ -36,80 +75,103 @@ namespace scalfmm::algorithms::mpi::pass
         static constexpr auto prio{omp::priorities::max};
         //
         auto& para = tree.get_parallel_manager();
-        auto comm = para.get_communicator();
+        auto& comm = para.get_communicator();
+        auto ptr_comm = &comm;
         auto rank = comm.rank();
         int nb_proc = comm.size();
         int tag_nb = 1200 + 10 * level;
         int tag_data = 1201 + 10 * level;
         auto mpi_int_type = cpp_tools::parallel_manager::mpi::get_datatype<int>();
-
+        // level where the multipoles are known
         auto level_child = level + 1;
+        // iterator on the child cell groups
+        auto it_first_group_of_child = tree.begin_mine_cells(level_child);
         // get size of multipoles
-        auto it_group1 = tree.begin_mine_cells(level_child);
-        auto it_cell1 = it_group1->get()->begin();
-        auto const& m1 = it_cell1->cmultipoles();
-        int size{int(nb_inputs * m1.at(0).size()) * nb_children};
+        auto it_first_cell_child = it_first_group_of_child->get()->begin();
+        auto const& m1 = it_first_cell_child->cmultipoles();
+        int size{int(nb_inputs * m1.at(0).size()) * nb_children};   // only nb_children - 1 are needed in the worst case
+
+        std::vector<dep_type> dependencies_out, dependencies_in;
 
-        std::vector<dep_type> deps;
+        auto const& distrib = tree.get_cell_distribution(level);
+        auto const& distrib_child = tree.get_cell_distribution(level_child);
+        bool send_data{false}, receive_data{false};
+
+        auto ptr_tree = &tree;
 
         if(rank > 0)
         {
             // If we send the multipoles, they must have been updated by the M2M of the previous level!
-
-            int count{0};
+            // Check whether we have to send something to the left neighbour (rank - 1).
+            // We are at level l and the children are at level l + 1.
+            // We must send the multipoles of our first children if their parent
+            // (first child Morton index shifted right by dimension) is the last cell
+            // owned by the previous process at level l, i.e. the parent lives on rank - 1.
             // serialization
-            std::vector<value_type> buffer(size);
-            // check if we have to send some children
-            auto first_group_of_child = tree.begin_mine_cells(level_child)->get();
-            auto last_index_child = first_group_of_child->component(0).index();
+            // check whether we have some children to send (if they exist, they start in the first group)
+
+            // first_index_child = morton index of the first child cell (at level = level_child)
+            auto first_index_child = distrib_child[rank][0];
+            // index_parent = first index of the cell at current level  on my process
+            auto index_parent = distrib[rank][0];
+            // parent_morton_index = last index of the cell on previous process at current level
+            auto previous_parent_morton_index = distrib[rank - 1][1] - 1;
+            // first_index_child = morton index of the first child cell (at level = level_child)
+            auto last_index_child_previous = distrib_child[rank - 1][1] - 1;
             //
-            auto first_group = tree.begin_mine_cells(level)->get();
-            auto index = first_group->component(0).index();
-            // Check if I have the parent
-
-            // Should be in a task !
-            auto it_group = tree.begin_mine_cells(level_child);
-            auto gs = it_group->get()->size();
-            int nb_grp_dep = std::min(static_cast<int>(nb_children / gs + 1),
-                                      static_cast<int>(std::distance(it_group, tree.end_mine_cells(level_child))));
-            auto it_grp = it_group;
-#ifdef M2M_COMM_TASKS
-            for(int i = 0; i < nb_grp_dep; ++i, ++it_grp)
-            {
-                deps.push_back(&(it_grp->get()->ccomponent(0).cmultipoles(0)));
-            }
-#pragma omp task shared(rank, mpi_int_type, size, tag_data, tag_nb, dimension) firstprivate(it_group)                  \
-  depend(iterator(std::size_t it = 0 : deps.size()), out : (deps[it])[0]) priority(prio)
-#endif
+            send_data = (first_index_child >> dimension) == previous_parent_morton_index;
+            if(send_data)
             {
-                if(index > last_index_child >> dimension)
+                // std::cout << "upward send comm  first_index_child " << first_index_child << " parent "
+                //           << (first_index_child >> dimension) << " previous child " << last_index_child_previous
+                //           << " its parent " << previous_parent_morton_index << std::endl
+                //           << std::flush;
+                // construct the dependencies on the group of multipoles
+                build_dependencies<dimension>(it_first_group_of_child, tree.end_mine_cells(level_child),
+                                              previous_parent_morton_index, dependencies_in);
+// io::print("upward(send) dependencies(in) ", dependencies_in);
+// std::cout << std::flush;
+// task to send the multipoles
+#pragma omp task default(none) shared(std::cout, std::clog)                                                            \
+  firstprivate(ptr_comm, rank, level, tag_data, tag_nb, mpi_int_type, size, dimension, nb_children,                    \
+                 previous_parent_morton_index, first_index_child, it_first_group_of_child)                             \
+  depend(iterator(std::size_t it = 0 : dependencies_in.size()), in : (dependencies_in[it])[0]),                        \
+  depend(inout : ptr_tree[0]) priority(prio)
                 {
+                    // std::clog << "upward(task send(" << level << ")) start \n";
+                    std::vector<value_type> buffer(size);
+                    int count{0};
+
+                    // std::clog << "upward(task send(" << level << ")) index " << first_index_child
+                    //           << "  parent_of_first_index_child " << previous_parent_morton_index << std::endl
+                    //           << std::flush;
                     // index is now the parent of the first child
-                    index = last_index_child >> dimension;
+                    auto index = first_index_child >> dimension;
                     // I have to send
                     // find the number of children to send (get pointer on multipoles !!)
                     // Construct an MPI datatype
 
                     // serialization
-
-                    // auto it_group = tree.begin_mine_cells(level_child);
-                    auto it_cell = it_group->get()->begin();
+                    auto it_group = it_first_group_of_child;
+                    //
+                    auto it_cell = it_first_group_of_child->get()->begin();
                     auto next = scalfmm::component::generate_linear_iterator(1, it_group, it_cell);
 
-                    //  We construct an MPI DATA_TYPE
-                    // MPI_Datatype mult_data_type;
                     auto it = std::begin(buffer);
+                    //
+                    // compute the number of cells to send and copy the multipoles in the buffer
                     for(int i = 0; i < nb_children - 1; ++i, next())
                     {
-                        if(index < (it_cell->index() >> dimension))
+                        if(previous_parent_morton_index < (it_cell->index() >> dimension))
                         {
                             break;
                         }
+                        // std::clog << "upward(task send) Check children  P " << index << " C " << it_cell->index()
+                        //           << " level " << it_cell->csymbolics().level << std::endl
+                        //           << std::flush;
                         // copy the multipoles in the buffer
                         auto const& m = it_cell->cmultipoles();
-
                         auto nb_m = m.size();
-
                         for(std::size_t i{0}; i < nb_m; ++i)
                         {
                             auto const& ten = m.at(i);
@@ -118,88 +180,249 @@ namespace scalfmm::algorithms::mpi::pass
                         }
                         ++count;
                     }
-                }
 
-                comm.isend(&count, 1, mpi_int_type, rank - 1, tag_nb);
+                    // std::clog << "upward(task send) nb_send = " << count << std::endl;
 
-                if(count != 0)
-                {
-                    // loop to serialize the multipoles
-                    auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+                    ptr_comm->isend(&count, 1, mpi_int_type, rank - 1, tag_nb);
 
-                    comm.isend(buffer.data(), size, mpi_type, rank - 1, tag_data);
-                }
+                    {
+                        // loop to serialize the multipoles
+                        auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+
+                        ptr_comm->isend(buffer.data(), size, mpi_type, rank - 1, tag_data);
+                        // std::cout << "upward(task send) buffer:";
+                        // for(int i = 0; i < buffer.size(); ++i)
+                        // {
+                        //     std::cout << "  " << buffer[i];
+                        // }
+                        // std::cout << " \n" << std::flush;
+                    }
+                    // std::clog << "upward(task send(" << level << ")) end \n";
+                }   // end task
             }
+            // else
+            // {
+            //     std::cout << "upward no send comm first_index_child " << first_index_child << " previous child "
+            //               << last_index_child_previous << std::endl
+            //               << std::flush;
+            // }
         }
-        if(rank == nb_proc - 1)
+
+        if(rank < nb_proc - 1)
         {
-            return;
+            // last_index_child = morton index of the last child cell (at level = level_child)
+            auto last_index_child = distrib_child[rank][1] - 1;
+            // parent_morton_index = last index of the cell on previous process at current level
+            auto parent_morton_index = distrib[rank][1] - 1;
+            // first_index_child_next = morton index of the first child cell (at level = level_child) on next mpi process
+            auto first_index_child_next = distrib_child[rank + 1][0];
+            receive_data = (last_index_child >> dimension) == (first_index_child_next >> dimension);
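+            // When the parent of my last child is also the parent of the first child owned by rank + 1,
+            // that parent's children are split across the two processes, so the missing multipoles are
+            // received into the right ghost groups below.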
+            if(receive_data)
+            {
+                // std::cout << "upward receive comm  last_index_child " << last_index_child << " parent "
+                //           << (last_index_child >> dimension) << " next child " << first_index_child_next
+                //           << " its parent " << (first_index_child_next >> dimension) << std::endl
+                //           << std::flush;
+                // dependencies_out contains the ghosts group with parent morton index
+                auto it_first_ghost = tree.end_mine_cells(level_child);
+                build_dependencies<dimension>(it_first_ghost, tree.end_cells(level_child), parent_morton_index,
+                                              dependencies_out);
+                // io::print(std::clog, "upward(receive) dependencies(out) ", dependencies_out);
+                // io::print(std::cout, "upward(receive) dependencies(out) ", dependencies_out);
+                // std::clog << std::flush;
+// dependencies_out
+#pragma omp task default(none) shared(std::cout, std::clog)                                                            \
+  firstprivate(ptr_comm, rank, level, tag_data, tag_nb, mpi_int_type, size, dimension, it_first_ghost)                 \
+  depend(iterator(std::size_t it = 0 : dependencies_out.size()), out : (dependencies_out[it])[0]),                     \
+  depend(inout : ptr_tree[0]) priority(prio)
+                {
+                    // std::clog << "upward(task receive(" << level << ")) start \n";
+                    int count{0};
+                    std::vector<value_type> buffer(size);
+                    ptr_comm->recv(&count, 1, mpi_int_type, rank + 1, tag_nb);
+                    // std::clog << "upward(task receive(" << level << "))  " << count << " cell(s)\n" << std::flush;
+
+                    auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+
+                    cpp_tools::parallel_manager::mpi::request recept_mpi_status(
+                      ptr_comm->irecv(buffer.data(), size, mpi_type, rank + 1, tag_data));
+                    cpp_tools::parallel_manager::mpi::request::waitall(1, &recept_mpi_status);
+
+                    // ptr_comm->recv(buffer.data(), size, mpi_type, rank + 1, tag_data);
+
+                    // std::cout << "upward(task receive)     buffer:";
+                    // for(int i = 0; i < buffer.size(); ++i)
+                    // {
+                    //     std::cout << "  " << buffer[i];
+                    // }
+                    // std::cout << std::endl << std::flush;
+                    // first right ghost
+                    auto it_group = it_first_ghost;
+                    auto it_cell = it_group->get()->begin();
+                    // linear iterator on cells
+                    auto next1 = scalfmm::component::generate_linear_iterator(1, it_group, it_cell);
+                    auto it = std::begin(buffer);
+
+                    for(int i = 0; i < count; ++i, next1())
+                    {
+                        // copy the multipoles in the buffer
+                        auto& m = it_cell->multipoles();
+                        auto nb_m = m.size();
+                        // std::clog << "upward((" << level << ")) cell index: " << it_cell->index() << " level "
+                        //           << it_cell->csymbolics().level << "\n"
+                        //           << std::flush;
+                        for(std::size_t i{0}; i < nb_m; ++i)
+                        {
+                            auto& ten = m.at(i);
+                            // std::cout << "upward()    ten(before) " << ten << std::endl << std::flush;
+                            std::copy(it, it + ten.size(), std::begin(ten));
+                            it += ten.size();
+                            // std::cout << "upward()    ten(after) " << ten << std::endl << std::flush;
+                        }
+                    }
+                    // std::clog << "upward(task receive(" << level << ")) end \n";
+                }   // end task
+            }
+            // else
+            // {
+            //     std::cout << "no receive comm last_index_child " << last_index_child << " next child "
+            //               << first_index_child_next << std::endl
+            //               << std::flush;
+            // }
         }
-        // Add task dependencies
+    }
+
+    template<typename Tree>
+    void prepare_comm_up(Tree& tree, const int level)
+    {
+        using value_type = typename Tree::base_type::cell_type::value_type;
+        using dep_type = typename Tree::group_of_cell_type::symbolics_type::ptr_multi_dependency_type;
+
+        static constexpr std::size_t dimension = Tree::base_type::box_type::dimension;
+        static constexpr int nb_inputs = Tree::cell_type::storage_type::inputs_size;
         //
-        // We receive the cells (at most 2^d - 1) to have all the children of
-        //  the last father cell I own. These cells go into the first phantom group on the right.
-        // dep(out) these cells
-        // dep(out) group_parent_dep[0]??
-
-        auto it_group = tree.end_mine_cells(level_child);
-        auto gs = it_group->get()->size();
-        int nb_grp_dep = std::min(static_cast<int>(nb_children / gs + 1),
-                                  static_cast<int>(std::distance(it_group, tree.end_cells(level_child))));
-        auto it_group_parent = --(tree.end_mine_cells(level));
-        auto it_grp = it_group;
-        for(int i = 0; i < nb_grp_dep; ++i, ++it_grp)
-        {
-            deps.push_back(&(it_grp->get()->ccomponent(0).cmultipoles(0)));
-        }
+        // number of theoretical children
+        constexpr int nb_children = math::pow(2, dimension);
+        static constexpr auto prio{omp::priorities::max};
+        //
+        auto& para = tree.get_parallel_manager();
+        auto& comm = para.get_communicator();
+        auto ptr_comm = &comm;
+        auto rank = comm.rank();
+        int nb_proc = comm.size();
+        int tag_nb = 1200 + 10 * level;
+        int tag_data = 1201 + 10 * level;
+        auto mpi_int_type = cpp_tools::parallel_manager::mpi::get_datatype<int>();
+        // level where the multipoles are known
+        auto level_child = level + 1;
+        // iterator on the child cell groups
+        auto it_first_group_of_child = tree.begin_mine_cells(level_child);
+        // get size of multipoles
+        auto it_first_cell_child = it_first_group_of_child->get()->begin();
+        auto const& m1 = it_first_cell_child->cmultipoles();
+        int size{int(nb_inputs * m1.at(0).size()) * nb_children};   // only nb_children - 1 are needed in the worst case
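+        // e.g. with nb_inputs == 1 and 64 coefficients per multipole tensor in 3-d, size == 8 * 64
+        // values; at most 2^dimension - 1 children actually need to be serialized.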
+
+        auto&& msg = tree.get_up_down_access(level);
 
-        auto group_parent_dep = it_group_parent->get()->ccomponent(0).cmultipoles(0);
-#ifdef M2M_COMM_TASKS
-#pragma omp task shared(rank, mpi_int_type, size, tag_data, tag_nb)                                                    \
-  depend(iterator(std::size_t it = 0 : deps.size()), out : (deps[it])[0]) priority(prio)
-#endif
+        auto const& distrib = tree.get_cell_distribution(level);
+        auto const& distrib_child = tree.get_cell_distribution(level_child);
+        bool send_data{false}, receive_data{false};
+
+        auto ptr_tree = &tree;
+
+        if(rank > 0)
         {
-            int count{-1};
-            comm.recv(&count, 1, mpi_int_type, rank + 1, tag_nb);
-            // use a recv
-            if(count > 0)
-            {
-                std::vector<value_type> buffer(size);
-                auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
-                comm.recv(buffer.data(), size, mpi_type, rank + 1, tag_data);
+            // If we send the multipoles, they must have been updated by the M2M of the previous level!
+            // Check whether we have to send something to the left neighbour (rank - 1).
+            // We are at level l and the children are at level l + 1.
+            // The ghosts at the child level (on the right of the previous process) must be updated
+            //   when the parent of our first child index is lower than our first index at level l,
+            //   i.e. when that parent is owned by the previous process.
+            // Check whether we have some children to send (if any exist, they start in the first group).
 
-                std::cout << std::endl;
-                auto it_group = tree.end_mine_cells(level_child);
-                auto it_cell = it_group->get()->begin();
-                auto next1 = scalfmm::component::generate_linear_iterator(1, it_group, it_cell);
-                auto it = std::begin(buffer);
+            // first_index_child = morton index of the first child cell (at level = level_child)
+            auto first_index_child = distrib_child[rank][0];
+            // index_parent = first Morton index of the cells at the current level on my process
+            auto index_parent = distrib[rank][0];
+            // previous_parent_morton_index = last Morton index at the current level on the previous process
+            auto previous_parent_morton_index = distrib[rank - 1][1] - 1;
+            // last_index_child_previous = Morton index of the last child cell (at level = level_child) on the previous process
+            auto last_index_child_previous = distrib_child[rank - 1][1] - 1;
+            //
+            send_data = (first_index_child >> dimension) == previous_parent_morton_index;
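+            // e.g. in 3-d (dimension = 3) a child with Morton index 42 has parent 42 >> 3 == 5; the
+            // multipoles are sent only when that parent is the last cell owned by rank - 1.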
 
-                for(int i = 0; i < count; ++i, next1())
-                {
-                    // copy the multipoles in the buffer
-                    auto& m = it_cell->multipoles();
+            if(send_data)
+            {
+                std::clog << "upward(task send(" << level << ")) start \n";
+                int count{0};
+                std::clog << "upward(task send(" << level << ")) index " << first_index_child
+                          << "  parent_of_first_index_child " << previous_parent_morton_index << std::endl
+                          << std::flush;
+                // index is now the parent of the first child
+                auto index = first_index_child >> dimension;
+                // I have to send
+                // find the number of children to send (get pointer on multipoles !!)
+                // Construct an MPI datatype
 
-                    auto nb_m = m.size();
-                    for(std::size_t i{0}; i < nb_m; ++i)
+                // serialization
+                auto it_group = it_first_group_of_child;
+                //
+                auto it_cell = it_first_group_of_child->get()->begin();
+                auto next = scalfmm::component::generate_linear_iterator(1, it_group, it_cell);
+                //
+                // compute the number of cells to send and copy the multipoles in the buffer
+                for(int i = 0; i < nb_children - 1; ++i, next())
+                {
+                    if(previous_parent_morton_index < (it_cell->index() >> dimension))
                     {
-                        auto& ten = m.at(i);
-                        std::copy(it, it + ten.size(), std::begin(ten));
-                        it += ten.size();
+                        break;
                     }
+                    std::clog << "upward(task send) Check children  P " << index << " C " << it_cell->index()
+                              << " level " << it_cell->csymbolics().level << std::endl
+                              << std::flush;
+                    ++count;
                 }
+                std::clog << "upward(task send) nb_send = " << count << std::endl;
+                ptr_comm->isend(&count, 1, mpi_int_type, rank - 1, tag_nb);
+
+                msg.set_nb_child_to_send(count);
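+                // Only the number of children is counted and cached here; the multipole payload
+                // itself is serialized and sent in start_communications().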
             }
         }
-    }
+        if(rank < nb_proc - 1)
+        {
+            // last_index_child = morton index of the last child cell (at level = level_child)
+            auto last_index_child = distrib_child[rank][1] - 1;
+            // parent_morton_index = last index of the cell on previous process at current level
+            auto parent_morton_index = distrib[rank][1] - 1;
+            // first_index_child_next = morton index of the first child cell (at level = level_child) on next mpi process
+            auto first_index_child_next = distrib_child[rank + 1][0];
+            receive_data = (last_index_child >> dimension) == (first_index_child_next >> dimension);
+            if(receive_data)
+            {
+                // std::cout << "upward receive comm  last_index_child " << last_index_child << " parent "
+                //           << (last_index_child >> dimension) << " next child " << first_index_child_next
+                //           << " its parent " << (first_index_child_next >> dimension) << std::endl
+                //           << std::flush;
 
-    /**
-    * @brief This function constructs the local approximation for all the cells of the tree by applying the operator m2m.
-    *
-    * @tparam Tree
-    * @tparam Approximation
-    *
-    * @param tree the tree target
-    * @param approximation the approximation to construct the local approximation
-    */
+                std::clog << std::flush;
+                {
+                    std::clog << "upward(task receive(" << level << ")) start \n";
+                    int count{0};
+                    std::vector<value_type> buffer(size);
+                    ptr_comm->recv(&count, 1, mpi_int_type, rank + 1, tag_nb);
+                    std::clog << "upward(task receive(" << level << "))  " << count << " cell(s)\n" << std::flush;
+                    msg.set_nb_child_to_receive(count);
+                }
+            }
+        }
+    }
+    /// @brief This function constructs the multipole approximation for all the cells of the tree by applying the M2M
+    /// operator.
+    ///
+    /// @param tree   the target tree
+    /// @param approximation the approximation used to build the multipoles
+    ///
     template<typename Tree, typename Approximation>
     inline auto upward(Tree& tree, Approximation const& approximation) -> void
     {
@@ -207,14 +430,29 @@ namespace scalfmm::algorithms::mpi::pass
 
         // upper working level is
         const int top_height = tree.box().is_periodic() ? 0 : 2;
-
+        // const int start_duplicated_level = tree.start_duplicated_level();
+        //
+        // int top = start_duplicated_level < 0 ? top_height : start_duplicated_level - 1;
         int top = top_height;
         for(int level = leaf_level - 1; level >= top /*top_height*/; --level)   // int because top_height could be 0
         {
+            // std::cout << "M2M : " << level + 1 << " -> " << level << std::endl << std::flush;
+            //
             start_communications(level, tree);
+            // std::cout << "  end comm " << std::endl << std::flush;
 
             omp::pass::upward_level(level, tree, approximation);
+            // std::cout << "  end upward_level " << level << std::endl << std::flush;
         }
+        // std::cout << "end upward " << std::endl << std::flush;
+
+        //
+
+        // for(int level = start_duplicated_level; level >= top_height; --level)   // int because top_height could be 0
+        // {
+        //     std::cout << "Level duplicated (seq): " << level << std::endl;
+        //     upward_level(level, tree, approximation);
+        // }
     }
 }   // namespace scalfmm::algorithms::mpi::pass
 
diff --git a/include/scalfmm/algorithms/omp/direct.hpp b/include/scalfmm/algorithms/omp/direct.hpp
index e11a58abf57208db8453b15af27bbe5d32c45dd3..07afa569c2572dbd8dd85b6648e210baed516bd5 100644
--- a/include/scalfmm/algorithms/omp/direct.hpp
+++ b/include/scalfmm/algorithms/omp/direct.hpp
@@ -52,8 +52,11 @@ namespace scalfmm::algorithms::omp::pass
 
         const auto& matrix_kernel = nearfield.matrix_kernel();
         // loop on the groups
-        auto begin_groups{std::get<0>(begin)};
-        const auto end_groups{std::get<0>(end)};
+        // auto begin_groups{std::get<0>(begin)};
+        // const auto end_groups{std::get<0>(end)};
+        auto begin_groups{tree.begin_mine_leaves()};
+        auto end_groups{tree.end_leaves()};
+        // auto end_groups{tree.end_mine_leaves()};
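+        // Note: end_leaves() (rather than end_mine_leaves()) is used here so that the ghost leaf
+        // groups at the end are also visited when building the mutual P2P interactions.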
         const auto prio_big{priorities::p2p_big};
         const auto prio_small{priorities::p2p_small};
         if(mutual)
@@ -94,7 +97,9 @@ namespace scalfmm::algorithms::omp::pass
             }
         }
         //
-        begin_groups = std::get<0>(begin);
+        // begin_groups = std::get<0>(begin);
+        begin_groups = tree.begin_mine_leaves();
+        end_groups = tree.end_mine_leaves();
         while(begin_groups != end_groups)
         {
             const auto current_group_ptr_particles = (*begin_groups).get()->depends_update();
diff --git a/include/scalfmm/algorithms/omp/downward.hpp b/include/scalfmm/algorithms/omp/downward.hpp
index 759a0c9b9f2527f07cadf1cdb753060614bda472..236647851675b92436bdb231203a22d1deb8f97d 100644
--- a/include/scalfmm/algorithms/omp/downward.hpp
+++ b/include/scalfmm/algorithms/omp/downward.hpp
@@ -33,6 +33,7 @@ namespace scalfmm::algorithms::omp::pass
         using interpolator_type = typename std::decay_t<ApproximationType>;
         using value_type = typename interpolator_type::value_type;
 
+        static constexpr auto prio{priorities::l2l};
         static constexpr auto dimension = interpolator_type::dimension;
 
         // Get the index of the corresponding child-parent interpolator
@@ -40,25 +41,28 @@ namespace scalfmm::algorithms::omp::pass
           (approximation.cell_width_extension() < std::numeric_limits<value_type>::epsilon()) ? 2 : level;
         // // parent level
         // iterator on the groups of cells (current level)
-        auto group_of_cell_begin = tree.begin_mine_cells(level);
+        auto group_of_cell_begin = tree.begin_cells(level);
         auto group_of_cell_end = tree.end_mine_cells(level);   //
         // iterator on the groups of cells (child level)
-        auto group_of_child_cell_begin = tree.begin_cells(level + 1);
-        auto group_of_child_cell_end = tree.end_cells(level + 1);
+        auto group_of_child_cell_begin = tree.begin_mine_cells(level + 1);
+        auto group_of_child_cell_end = tree.end_mine_cells(level + 1);
 
         auto start_range_dependencies{group_of_cell_begin};
         auto end_range_dependencies{group_of_cell_begin};
 
-        static constexpr auto prio{priorities::l2l};
-
+        // We iterate on the group of child cells
         while(group_of_child_cell_begin != group_of_child_cell_end)
         {
             using ptr_parent_groups_type = std::decay_t<decltype(group_of_cell_begin->get())>;
 
-            auto group_child = group_of_child_cell_begin->get();
-            auto group_child_raw = &group_child->ccomponent(0).clocals(0);
-            auto child_group_starting_morton_index = group_child->csymbolics().starting_index;
-            auto child_group_ending_morton_index = group_child->csymbolics().ending_index;
+            auto ptr_group_child = group_of_child_cell_begin->get();
+            // For the dependencies
+            auto group_child_raw = &ptr_group_child->ccomponent(0).clocals(0);
+            auto child_group_starting_morton_index = ptr_group_child->csymbolics().starting_index;
+            auto child_group_ending_morton_index = ptr_group_child->csymbolics().ending_index;
+            // std::cout << " group cells  in [" << child_group_starting_morton_index << ", "
+            //           << child_group_ending_morton_index << "[" << std::endl;
+            //
             auto parent_starting_morton_index = child_group_starting_morton_index >> dimension;
             auto parent_ending_morton_index = ((child_group_ending_morton_index - 1) >> dimension) + 1;
 
@@ -66,39 +70,47 @@ namespace scalfmm::algorithms::omp::pass
 
             std::tie(start_range_dependencies, end_range_dependencies) = index::get_parent_group_range(
               parent_starting_morton_index, parent_ending_morton_index, start_range_dependencies, group_of_cell_end);
+
             auto start_range_dependencies_tmp{start_range_dependencies};
             const auto end_range_dependencies_tmp{end_range_dependencies};
 
             while(start_range_dependencies != end_range_dependencies)
             {
+                // std::cout << "     --> parent cells  in [" << (*start_range_dependencies)->csymbolics().starting_index
+                //           << ", " << (*start_range_dependencies)->csymbolics().ending_index << "[" << std::endl;
                 parent_dependencies.push_back(&(*start_range_dependencies)->ccomponent(0).clocals(0));
-                // parent_groups.push_back(start_range_dependencies->get());
                 ++start_range_dependencies;
             }
 
             start_range_dependencies = --end_range_dependencies;
             // clang-format off
-#pragma omp task untied default(none) firstprivate(group_child, start_range_dependencies_tmp, end_range_dependencies_tmp, level_interpolator_index)                                         \
-  shared(approximation)                                                                      \
-    depend(iterator(it = 0 : std::size(group_child->csymbolics().group_dependencies_l2l_in)),                          \
-           in : (group_child->csymbolics().group_dependencies_l2l_in.at(it))[0]) depend(inout  : group_child_raw[0])   \
+#pragma omp task untied default(none) firstprivate(ptr_group_child, start_range_dependencies_tmp, end_range_dependencies_tmp, level_interpolator_index,group_child_raw) \
+    shared(approximation, std::cout)                \
+    depend(iterator(it = 0 : std::size(ptr_group_child->csymbolics().group_dependencies_l2l_in)),                          \
+           in : (ptr_group_child->csymbolics().group_dependencies_l2l_in.at(it))[0]) depend(inout  : group_child_raw[0])   \
     priority(prio)
             // clang-format on
             {   // Can be a task(in:iterParticles, out:iterChildCells ...)
+                // io::print("l2l(task) dependencies(transfer)(in): ",
+                //           ptr_group_child->csymbolics().group_dependencies_l2l_in);
+                // std::cout << "l2l(task) run  task dep(inout) on group group_child_raw  " << group_child_raw << std::endl
+                //           << std::flush;
                 std::vector<ptr_parent_groups_type> parent_groups;
                 while(start_range_dependencies_tmp != end_range_dependencies_tmp)
                 {
                     parent_groups.push_back(start_range_dependencies_tmp->get());
                     ++start_range_dependencies_tmp;
                 }
-                for(std::size_t cell_index = 0; cell_index < group_child->csymbolics().number_of_component_in_group;
+                for(std::size_t cell_index = 0; cell_index < ptr_group_child->csymbolics().number_of_component_in_group;
                     ++cell_index)
                 {
-                    auto& child_cell = group_child->component(cell_index);
+                    auto& child_cell = ptr_group_child->component(cell_index);
                     static constexpr auto number_of_child = math::pow(2, dimension);
 
                     auto child_morton_index{child_cell.index()};
                     auto parent_morton_index{child_morton_index >> dimension};
+                    // std::cout << " cell index " << child_morton_index << " its parent_morton_index "
+                    //           << parent_morton_index << std::endl;
 
                     for(auto p: parent_groups)
                     {
@@ -107,7 +119,8 @@ namespace scalfmm::algorithms::omp::pass
                             int parent_index_in_group = p->component_index(parent_morton_index);
                             assertm(parent_index_in_group != -1, "Upward pass: parent cell not found!");
                             auto const& parent_cell = p->ccomponent(std::size_t(parent_index_in_group));
-
+                            // std::cout << "         parent found " << parent_cell.index() << " locals "
+                            //           << parent_cell.clocals().at(0) << std::endl;
                             const std::size_t child_index = child_morton_index & (number_of_child - 1);
                             l2l(approximation, parent_cell, child_index, child_cell, level_interpolator_index);
                         }
diff --git a/include/scalfmm/algorithms/omp/leaf_to_cell.hpp b/include/scalfmm/algorithms/omp/leaf_to_cell.hpp
index f1b53a405d3c2a89c3d8e23602941292ea5bc584..d6b251490eda627a8b380d788018399539cfeef7 100644
--- a/include/scalfmm/algorithms/omp/leaf_to_cell.hpp
+++ b/include/scalfmm/algorithms/omp/leaf_to_cell.hpp
@@ -53,9 +53,10 @@ namespace scalfmm::algorithms::omp::pass
             auto group_cell_raw = &(*group_of_cell_begin)->ccomponent(0).cmultipoles(0);
             // clang-format off
 #pragma omp task untied default(none) firstprivate(group_of_leaf_begin, group_of_cell_begin, group_cell_raw)  \
-  shared(approximation, far_field) depend(inout : group_cell_raw[0]) priority(prio)
+  shared(approximation, far_field, std::cout) depend(inout : group_cell_raw[0]) priority(prio)
             // clang-format on
             {
+                // std::cout << "leaf_to_Cell(task) dependencies(out)  " << group_cell_raw << std::endl << std::flush;
                 // Can be a task(in:iterParticles, out:iterCells)
                 auto leaf_begin = (*group_of_leaf_begin)->cbegin();
                 auto cell_begin = (*group_of_cell_begin)->begin();
diff --git a/include/scalfmm/algorithms/omp/transfer.hpp b/include/scalfmm/algorithms/omp/transfer.hpp
index d24931582f7230fe6cfceb6a84fd41c74e76f903..c8406be836e043298d55add80872d8a2b685b516 100644
--- a/include/scalfmm/algorithms/omp/transfer.hpp
+++ b/include/scalfmm/algorithms/omp/transfer.hpp
@@ -63,21 +63,40 @@ namespace scalfmm::algorithms::omp::pass
         ///            loop on the groups at level level
         const auto num_threads{omp_get_num_threads()};
         for(auto cell_target_level_it = begin_cell_target_level_it; cell_target_level_it != end_cell_target_level_it;
-            ++cell_target_level_it, ++num)
+            ++cell_target_level_it)
         {
             // get() because cell_target_level_it is a shared pointer
             auto group_ptr = cell_target_level_it->get();
             // dependence on the first local of the group
             auto ptr_local_raw = &(group_ptr->ccomponent(0).clocals(0));
-            static constexpr auto prio{priorities::m2l};
+            auto prio{priorities::m2l};
 
+#ifdef SCALFMM_USE_MPI
+            // Increase the priority of the last group (for the L2L in MPI)
+            if(cell_target_level_it == (end_cell_target_level_it - 1))
+            {
+                prio = priorities::max;
+            }
+#endif
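+            // The locals of this last owned group feed the L2L ghost exchange, so its M2L task is
+            // given the highest priority to unblock the MPI communication as early as possible.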
+            //
+            // std::cout << "M2L level = " << level << std::endl << std::flush;
+            // io::print("M2L dependencies(transfer)(in): ", group_ptr->csymbolics().group_dependencies_m2l);
+            // std::cout << "M2L dep(inout) on groupe ptr_local_raw  " << ptr_local_raw << std::endl << std::flush;
+
+            //
             // clang-format off
-#pragma omp task untied default(none) firstprivate(group_ptr, ptr_local_raw) shared(approximation, buffers)            \
+#pragma omp task untied default(none) firstprivate(group_ptr, ptr_local_raw, level) shared(approximation, buffers, std::clog)            \
   depend(iterator(it = 0  : std::size(group_ptr->csymbolics().group_dependencies_m2l)),                                \
          in  : (group_ptr->csymbolics().group_dependencies_m2l.at(it)[0]))   depend(inout : ptr_local_raw[0]) priority(prio)
             // clang-format on
             {
                 const auto thread_id{omp_get_thread_num()};
+                // io::print(std::clog,
+                //           "m2l(task) dependencies(transfer)(in): ", group_ptr->csymbolics().group_dependencies_m2l);
+                // std::clog << "m2l(task) start  task dep(inout) on groupe ptr_local_raw  " << ptr_local_raw
+                //           << " level=" << level << std::endl
+                //           << std::flush;
+
                 ///////////////////////////////////////////////////////////////////////////////////////
                 //          loop on the leaves of the current group
                 for(std::size_t index_in_group{0}; index_in_group < std::size(*group_ptr); ++index_in_group)
@@ -90,7 +109,12 @@ namespace scalfmm::algorithms::omp::pass
                     // post-processing the leaf if necessary
                     approximation.apply_multipoles_postprocessing(target_cell, *buffers.at(thread_id), thread_id);
                     approximation.buffer_reset(*buffers.at(thread_id));
+                    // std::clog << "m2l(task) end cell index " << target_cell.index() << " locals "
+                    //           << target_cell.locals().at(0) << std::endl;
                 }
+                // std::clog << "m2l(task) end  task dep(inout) on groupe ptr_local_raw  " << ptr_local_raw
+                //           << " level=" << level << std::endl
+                //           << std::flush;
             }   // end pragma task
             /// post-processing the group if necessary
         }
diff --git a/include/scalfmm/algorithms/omp/upward.hpp b/include/scalfmm/algorithms/omp/upward.hpp
index 7738ae44c0dacf661120e07f4352807ac42408e7..f5e70fed7ea74761497a7f9438d16d300cfc3f26 100644
--- a/include/scalfmm/algorithms/omp/upward.hpp
+++ b/include/scalfmm/algorithms/omp/upward.hpp
@@ -5,7 +5,7 @@
 #ifndef SCALFMM_ALGORITHMS_OMP_UPWARD_HPP
 #define SCALFMM_ALGORITHMS_OMP_UPWARD_HPP
 
-#ifdef _OPENMP
+#include <limits>
 
 #include "scalfmm/operators/m2m.hpp"
 #include "scalfmm/operators/tags.hpp"
@@ -13,7 +13,7 @@
 #include "scalfmm/utils/massert.hpp"
 #include "scalfmm/utils/math.hpp"
 
-#include <limits>
+#ifdef _OPENMP
 #include <omp.h>
 
 namespace scalfmm::algorithms::omp::pass
@@ -43,8 +43,10 @@ namespace scalfmm::algorithms::omp::pass
           (approximation.cell_width_extension() < std::numeric_limits<value_type>::epsilon()) ? 2 : level;
         //
         // iterator on the groups of cells (child level)
-        auto group_of_child_cell_begin = tree.begin_cells(level + 1);
+        //
+        auto group_of_child_cell_begin = tree.begin_mine_cells(level + 1);
         auto group_of_child_cell_end = tree.end_cells(level + 1);
+
         // iterator on the groups of cells (current level)
         auto group_of_cell_begin = tree.begin_mine_cells(level);
         auto group_of_cell_end = tree.end_mine_cells(level);
@@ -75,13 +77,17 @@ namespace scalfmm::algorithms::omp::pass
 
             start_range_dependencies = --end_range_dependencies;
             // clang-format off
-  #pragma omp task untied default(none) firstprivate( group_parent, start_range_dependencies_tmp, end_range_dependencies_tmp,level_interpolator_index)       \
+  #pragma omp task untied default(none) firstprivate( group_parent, start_range_dependencies_tmp, end_range_dependencies_tmp,level_interpolator_index, group_parent_raw)       \
      shared(std::cout, approximation )                                                    \
   depend(iterator(std::size_t it = 0 : grp_parent_sym.group_dependencies_m2m_in.size()),  \
                    in: (grp_parent_sym.group_dependencies_m2m_in.at(it))[0])              \
   depend(inout: group_parent_raw[0]) priority(prio)
             // clang-format on
             {
+                // io::print("m2m(task) dependencies(in): ", group_parent->csymbolics().group_dependencies_m2m_in);
+                // std::cout << "m2m(task) run  task dep(inout) on group group_parent_raw  " << group_parent_raw
+                //           << std::endl
+                //           << std::flush;
                 std::vector<ptr_child_groups_type> child_groups;
 
                 while(start_range_dependencies_tmp != end_range_dependencies_tmp)
@@ -106,6 +112,8 @@ namespace scalfmm::algorithms::omp::pass
                                 m2m(approximation, child_cell, child_index, parent_cell, level_interpolator_index);
                             }
                         }
+                        // std::cout << "m2m(task) end cell index " << parent_cell.index() << " multipoles "
+                        //           << parent_cell.multipoles().at(0) << std::endl;
                     }
                     approximation.apply_multipoles_preprocessing(parent_cell, omp_get_thread_num());
                 }
diff --git a/include/scalfmm/algorithms/sequential/sequential.hpp b/include/scalfmm/algorithms/sequential/sequential.hpp
index cb4b53651e3e9f9ad9ef8ccbffa11fcb5ccc6c1e..b002c930499ed743fbe6da8a74d2db803e602853 100644
--- a/include/scalfmm/algorithms/sequential/sequential.hpp
+++ b/include/scalfmm/algorithms/sequential/sequential.hpp
@@ -204,6 +204,18 @@ namespace scalfmm::algorithms::sequential
                     timers["p2p"].tac();
                 }
             }
+#ifdef _DEBUG_BLOCK_DATA
+            std::clog << "\n  blocks after direct\n";
+            int tt{0};
+            auto group_of_leaves = tree_target.vector_of_leaf_groups();
+
+            for(auto pg: group_of_leaves)
+            {
+                std::clog << "block index " << tt++ << std::endl;
+                pg->cstorage().print_block_data(std::clog);
+            }
+            std::clog << "  ---------------------------------------------------\n";
+#endif
 
             // print time of each pass
             if constexpr(options::has(s, options::timit))
diff --git a/include/scalfmm/interpolation/barycentric/barycentric_interpolator.hpp b/include/scalfmm/interpolation/barycentric/barycentric_interpolator.hpp
index f6786301390edfcc676a062acb2654c65e6bbbe9..d631bb50b06f821049b3053483c0e4504afe2585 100644
--- a/include/scalfmm/interpolation/barycentric/barycentric_interpolator.hpp
+++ b/include/scalfmm/interpolation/barycentric/barycentric_interpolator.hpp
@@ -9,6 +9,7 @@
 #include "scalfmm/container/variadic_adaptor.hpp"
 #include "scalfmm/interpolation/grid_storage.hpp"
 #include "scalfmm/interpolation/interpolator.hpp"
+#include "scalfmm/interpolation/m2l_handler.hpp"
 #include "scalfmm/interpolation/mapping.hpp"
 #include "scalfmm/interpolation/permutations.hpp"
 #include "scalfmm/interpolation/traits.hpp"
@@ -349,6 +350,15 @@ namespace scalfmm::interpolation
             return xt::xarray<value_type>(std::vector{math::pow(order, dimension)}, value_type(1.));
 #endif
         }
+        /**
+         * @brief Compute the memory in bytes used by the interpolator.
+         *
+         * @return the memory used by the interpolator
+         */
+        [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+        {
+            return base_m2l_handler_type::memory_usage();
+        }
 
       private:
         /**
diff --git a/include/scalfmm/interpolation/chebyshev/chebyshev_interpolator.hpp b/include/scalfmm/interpolation/chebyshev/chebyshev_interpolator.hpp
index ffea501c2fca7bda1d36bec3a0c9200135c7b813..5d45d38d60d2a526c7c92814d2dfbd9d3d250a48 100644
--- a/include/scalfmm/interpolation/chebyshev/chebyshev_interpolator.hpp
+++ b/include/scalfmm/interpolation/chebyshev/chebyshev_interpolator.hpp
@@ -9,6 +9,7 @@
 #include "scalfmm/container/variadic_adaptor.hpp"
 #include "scalfmm/interpolation/grid_storage.hpp"
 #include "scalfmm/interpolation/interpolator.hpp"
+#include "scalfmm/interpolation/m2l_handler.hpp"
 #include "scalfmm/interpolation/mapping.hpp"
 #include "scalfmm/interpolation/permutations.hpp"
 #include "scalfmm/interpolation/traits.hpp"
@@ -364,6 +365,15 @@ namespace scalfmm::interpolation
             meta::looper_range<dimension>{}(generate_weights, starts, stops);
             return roots_weights;
         }
+        /**
+         * @brief Compute the memory in bytes used by the interpolator.
+         *
+         * @return the memory used by the interpolator
+         */
+        [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+        {
+            return base_m2l_handler_type::memory_usage();
+        }
 
       private:
         /**
diff --git a/include/scalfmm/interpolation/interpolator.hpp b/include/scalfmm/interpolation/interpolator.hpp
index 341aa998790fadc1abcbe999f0f9fabf01ef8735..0f4b1b1af22c825260f050fd31a032e6aa476cb1 100644
--- a/include/scalfmm/interpolation/interpolator.hpp
+++ b/include/scalfmm/interpolation/interpolator.hpp
@@ -7,7 +7,6 @@
 
 #include "scalfmm/container/point.hpp"
 #include "scalfmm/interpolation/builders.hpp"
-#include "scalfmm/interpolation/m2l_handler.hpp"
 #include "scalfmm/interpolation/mapping.hpp"
 #include "scalfmm/interpolation/permutations.hpp"
 #include "scalfmm/matrix_kernels/mk_common.hpp"
@@ -522,48 +521,30 @@ namespace scalfmm::interpolation
                 return *static_cast<derived_type*>(this);
             }
 
-          private:
             /**
-             * @brief number of terms of the expansion (1d)
+             * @brief Compute the memory in bytes used by the interpolator
              *
+             * @return the memory used by the interpolator
              */
-            const size_type m_order{};
+            [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+            {
+                return this->derived_cast().memory_usage();
+            }
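+            // memory_usage() is resolved through CRTP: derived_cast() dispatches to the concrete
+            // interpolator, which forwards to its m2l_handler base (see m2l_handler.hpp).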
 
-            /**
-             * @brief number of modes m_order^dimension
-             *
-             */
-            const size_type m_nnodes{};
+          private:
+            const size_type m_order{};   ///< number of terms of the expansion (1d)
 
-            /**
-             * @brief height of the tree
-             *
-             */
-            const size_type m_tree_height{};
+            const size_type m_nnodes{};   ///<  number of modes m_order^dimension
 
-            /**
-             * @brief width of the simulation box
-             *
-             */
-            const value_type m_root_cell_width{};
+            const size_type m_tree_height{};   ///<  height of the tree
 
-            /**
-             * @brief width of the extension of the cell
-             *
-             */
-            const value_type m_cell_width_extension{};
+            const value_type m_root_cell_width{};   ///< width of the simulation box
 
-            /**
-             * @brief true if we use the cell extension
-             *
-             */
-            const bool m_use_cell_width_extension{};
+            const value_type m_cell_width_extension{};   ///< width of the extension of the cell
 
-            /**
-             * @brief
-             *
-             */
-            array_type m_child_parent_interpolators{};
+            const bool m_use_cell_width_extension{};   ///< true if we use the cell extension
+
+            array_type m_child_parent_interpolators{};   ///< the child/parent interpolators
 
             /**
              * @brief
diff --git a/include/scalfmm/interpolation/m2l_handler.hpp b/include/scalfmm/interpolation/m2l_handler.hpp
index e92e1122b18fa17bb08366be622c90a898813071..1a1b5fec45dba2a756f80ea93b6224903a3ed160 100644
--- a/include/scalfmm/interpolation/m2l_handler.hpp
+++ b/include/scalfmm/interpolation/m2l_handler.hpp
@@ -383,7 +383,44 @@ namespace scalfmm::interpolation
             {
                 return m_interactions_matrices;
             }
+            /**
+             * @brief Compute the memory in bytes used by the interpolator.
+             *
+             * @return the memory used by the interpolator
+             */
+            [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+            {
+                std::size_t memory{0};
 
+                for(auto const& mat: m_interactions_matrices)
+                {
+                    // mat is an xtensor of interaction matrices of shape (kn, km)
+                    for(std::size_t i = 0; i < mat.shape()[0]; ++i)
+                    {
+                        for(std::size_t j = 0; j < mat.shape()[1]; ++j)
+                        {
+                            auto const& K = mat.at(i, j);
+                            // dense and fft cases: one tensor of coefficients
+                            if constexpr(std::is_same_v<settings, options::dense_> or
+                                         std::is_same_v<settings, options::fft_>)
+                            {
+                                memory += K.size() * sizeof(typename k_tensor_type::value_type);
+                            }
+                            else if constexpr(std::is_same_v<settings, options::low_rank_>)
+                            {
+                                // low-rank case: a tuple of two matrices (A, B)
+                                auto const& A = std::get<0>(K);
+                                auto const& B = std::get<1>(K);
+                                memory += (A.size() + B.size()) *
+                                          sizeof(typename std::tuple_element_t<0, k_tensor_type>::value_type);
+                            }
+                        }
+                    }
+                }
+                return memory;
+            }
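+            // Example use (hypothetical interpolator instance `interp`):
+            //   std::cout << "M2L matrices use " << interp.memory_usage() / (1024. * 1024.) << " MiB\n";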
             /**
              * @brief
              *
@@ -1234,12 +1271,8 @@ namespace scalfmm::interpolation
             }
 
           private:
-            /**
-             * @brief
-             *
-             */
             std::vector<interaction_matrix_type, XTENSOR_DEFAULT_ALLOCATOR(interaction_matrix_type)>
-              m_interactions_matrices{};
+              m_interactions_matrices{};   ///< the M2L matrices
 
             /**
              * @brief
diff --git a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
index 612e322fb4bafbedc21bf2bee402fb2b36e7a051..635d23eb2a2cc80237e0749943959a869e898b58 100644
--- a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
+++ b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
@@ -17,6 +17,7 @@
 #include "scalfmm/interpolation/generate_circulent.hpp"
 #include "scalfmm/interpolation/grid_storage.hpp"
 #include "scalfmm/interpolation/interpolator.hpp"
+#include "scalfmm/interpolation/m2l_handler.hpp"
 #include "scalfmm/interpolation/mapping.hpp"
 #include "scalfmm/interpolation/traits.hpp"
 #include "scalfmm/interpolation/uniform/uniform_storage.hpp"
@@ -284,7 +285,7 @@ namespace scalfmm::interpolation
         }
 
         /**
-         * @brief
+         * @brief Compute the weights of the quadrature
          *
          * @param order
          * @return xt::xarray<value_type>
@@ -293,6 +294,16 @@ namespace scalfmm::interpolation
         {
             return xt::xarray<value_type>(std::vector{math::pow(order, dimension)}, value_type(1.));
         }
+
+        /**
+         * @brief Compute the memory in bytes used by the interpolator.
+         *
+         * @return the memory used by the interpolator
+         */
+        [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+        {
+            return base_m2l_handler_type::memory_usage();
+        }
     };
 
     /**
@@ -662,7 +673,15 @@ namespace scalfmm::interpolation
             }
             return L * sum;
         }
-
+        /**
+         * @brief Compute the memory in bytes used by the interpolator.
+         *
+         * @return the memory used by the interpolator
+         */
+        [[nodiscard]] inline auto memory_usage() const noexcept -> std::size_t
+        {
+            return base_m2l_handler_type::memory_usage();
+        }
         /**
          * @brief Returns the buffers initialized for the optimized fft
          *
diff --git a/include/scalfmm/lists/utils.hpp b/include/scalfmm/lists/utils.hpp
index 57643087baeb2866e26034091ac4713b337e3e6f..671b03bc27a3ee081459b42cb37d311e56e4c42a 100644
--- a/include/scalfmm/lists/utils.hpp
+++ b/include/scalfmm/lists/utils.hpp
@@ -926,6 +926,33 @@ namespace scalfmm::list
                                                [](auto const& a, auto const& b) { return a->index() < b->index(); });
                                  });
     }
+    // template<typename TREE>
+    // void clean_p2p_ghosts_interactions(TREE& tree)
+    // {
+    //     auto begin_groups{tree.end_mine_leaves()};
+    //     const auto end_groups{tree.cend_leaves()};
+    //     while(begin_groups != end_groups)
+    //     {
+    //         //
+    //         ++begin_groups;
+    //     }
+    //     for(auto group_leaf_iterator_begin = begin_groups; group_leaf_iterator_begin != end_groups;
+    //         ++group_leaf_iterator_begin)
+    //     {
+    //         auto group = (*group_leaf_iterator_begin);
+    //         auto& group_symbolics = group->symbolics();
+
+    //         auto& out_of_group = group_symbolics.outside_interactions;
+
+    //         for(auto&& leaf: (*group_leaf_iterator_begin)->components())
+    //         {
+    //             // get symbolics
+    //             auto& leaf_symbolics = leaf.symbolics();
+    //             // remove the interactions inside the bloc
+    //             leaf_symbolics.number_of_neighbors = 0;
+    //         }
+    // }
+    // }
 }   // namespace scalfmm::list
 
 #endif   // SCALFMM_LISTS_UTIL_HPP
diff --git a/include/scalfmm/operators/count_kernel/count_kernel.hpp b/include/scalfmm/operators/count_kernel/count_kernel.hpp
index a7417529111b754d588c3ef00103ca25c614b53f..bd6e006737cb0ee8545e83eea44a48eebf4224be 100644
--- a/include/scalfmm/operators/count_kernel/count_kernel.hpp
+++ b/include/scalfmm/operators/count_kernel/count_kernel.hpp
@@ -82,6 +82,7 @@ namespace count_kernels
              *
              */
             static constexpr int separation_criterion{1};
+            std::string name() const { return std::string("count kernel "); }
         };
 
         /**
diff --git a/include/scalfmm/parallel/comm_access.hpp b/include/scalfmm/parallel/comm_access.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..d7ff3653cb26a03d9204b5f46f7334d281ce3352
--- /dev/null
+++ b/include/scalfmm/parallel/comm_access.hpp
@@ -0,0 +1,100 @@
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#ifdef SCALFMM_USE_MPI
+#include <mpi.h>
+#endif
+template<typename morton_type, typename grp_access_type>
+class transferDataAccess
+{
+  private:
+    // vector per level and per process
+    std::vector<std::vector<std::vector<morton_type>>> m_send_morton;
+    std::vector<std::vector<std::vector<grp_access_type>>> m_receive_cells_access;
+    std::vector<std::vector<std::vector<grp_access_type>>> m_send_cells_access;
+
+#ifdef SCALFMM_USE_MPI
+    std::vector<std::vector<MPI_Datatype>> m_send_multipoles_type;
+    std::vector<std::vector<MPI_Datatype>> m_receive_multipoles_type;
+#endif
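+    // Indexing convention: the outer index is the tree level and the inner one the MPI rank, e.g.
+    // m_send_morton[level][proc] is the list of Morton indices exchanged with rank `proc` at `level`.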
+
+  public:
+    auto inline get_send_multipole_types(const int& level) -> std::vector<MPI_Datatype>&
+    {
+        return m_send_multipoles_type[level];
+    }
+
+    auto inline print_send_multipole_types(const int& level) -> void
+    {
+        auto const& type = m_send_multipoles_type[level];
+        for(std::size_t p = 0; p < type.size(); ++p)
+        {
+            std::cout << "   ptr_data_type(" << p << ") " << &(type[p]) << " level: " << level << std::endl
+                      << std::flush;
+        }
+    }
+    auto inline get_receive_multipole_types(const int& level) -> std::vector<MPI_Datatype>&
+    {
+        return m_receive_multipoles_type[level];
+    }
+    auto inline send_morton_indexes(int const& level, int const& proc) -> std::vector<morton_type>&
+    {
+        return m_send_morton[level][proc];
+    }
+    auto inline send_morton_indexes(int const& level) -> std::vector<std::vector<morton_type>>&
+    {
+        return m_send_morton[level];
+    }
+    auto inline receive_cells_access(int const& level) -> std::vector<std::vector<grp_access_type>>&
+    {
+        return m_receive_cells_access[level];
+    }
+
+    auto inline send_cells_access(int const& level) -> std::vector<std::vector<grp_access_type>>&
+    {
+        return m_send_cells_access[level];
+    }
+
+    transferDataAccess(const int tree_height, const int nb_proc)
+    {
+        m_receive_multipoles_type.resize(tree_height);
+        m_send_multipoles_type.resize(tree_height);
+        m_send_morton.resize(tree_height);
+        for(auto& vec: m_send_morton)
+        {
+            vec.resize(nb_proc);
+        }
+        m_receive_cells_access.resize(tree_height);
+        for(auto& vec: m_receive_cells_access)
+        {
+            vec.resize(nb_proc);
+        }
+        m_send_cells_access.resize(tree_height);
+        for(auto& vec: m_send_cells_access)
+        {
+            vec.resize(nb_proc);
+        }
+    }
+};
+
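+// Per-level counters filled by prepare_comm_up(): the number of child cells to send to rank - 1 and
+// to receive from rank + 1 during the upward pass.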
+class UpDownDataAccess
+{
+    bool done{false};          // true once the exchange has been prepared
+    int nb_child_send{0};      // number of children to send
+    int nb_child_receive{0};   // number of children to receive
+
+  public:
+    UpDownDataAccess()
+      : done(false)
+      , nb_child_send(0)
+      , nb_child_receive(0)
+    {
+    }
+    bool is_done() { return done; }
+    auto get_nb_child_to_send() const { return nb_child_send; }
+    void set_nb_child_to_send(const int n) { nb_child_send = n; }
+    auto get_nb_child_to_receive() const { return nb_child_receive; }
+    void set_nb_child_to_receive(const int n) { nb_child_receive = n; }
+};
\ No newline at end of file
diff --git a/include/scalfmm/parallel/mpi/comm.hpp b/include/scalfmm/parallel/mpi/comm.hpp
index bf1219e99ec1c61513223ac3eae083f649748d0a..f26399e2a7a6ed52543bac6e2eb6c8d1c06af9e4 100644
--- a/include/scalfmm/parallel/mpi/comm.hpp
+++ b/include/scalfmm/parallel/mpi/comm.hpp
@@ -1,9 +1,8 @@
-// --------------------------------
-// See LICENCE file at project root
-// File : scalfmm/parallel/mpi/comm.hpp
-// --------------------------------
 #pragma once
 
+#include <stdexcept>
+#include <vector>
+
 #include "scalfmm/container/particle.hpp"
 #include "scalfmm/meta/utils.hpp"
 #include "scalfmm/parallel/utils.hpp"
@@ -12,59 +11,106 @@
 
 #include <mpi.h>
 
-#include <vector>
-
 namespace scalfmm::parallel::comm
 {
     using comm_type = cpp_tools::parallel_manager::mpi::communicator;
 
+    // Debug functions
+    namespace debug
+    {
+        template<typename TreeType>
+        void print_all_cells(TreeType& tree, int level, std::string title)
+        {
+            std::cout << "M2L  --  level " << level << "   --  " << std::endl;
+            scalfmm::component::for_each_mine_component(tree.begin_cells(level), tree.end_cells(level),
+                                                        [&title](auto const& cell)
+                                                        {
+                                                            std::cout << title << " morton " << cell.index()
+                                                                      << "  multipoles "
+                                                                      << cell.transfer_multipoles().at(0) << "  locals "
+                                                                      << cell.locals().at(0) << std::endl
+                                                                      << std::flush;
+                                                        });
+        }
+        template<typename VectorAccessType>
+        inline void set_values(VectorAccessType& cells_to_send_access)
+        {
+            std::cout << " debug::set_values " << cells_to_send_access.size() << std::endl << std::flush;
+            // iterate on the cells
+            int start{20};
+            for(auto access: cells_to_send_access)
+            {
+                auto& cell = (*(access.first))->component(access.second);
+
+                //			     std::cout << " morton to find " << index_to_send[idx] << " cell found "
+                //			               << (*grp_ptr)->component(pos).csymbolics().morton_index << '\n';
+                auto& m = cell.transfer_multipoles();
+                auto nb_m = m.size();
+                //			    std::cout << "          nb_m" <<  m.size() <<std::endl;
+                for(std::size_t i{0}; i < nb_m; ++i)
+                {
+                    auto& ten = m.at(i);
+                    // std::copy(std::begin(ten), std::end(ten), it);
+                    ten[0] = start;
+                    std::cout << " start " << start << "   " << ten[0] << std::endl;
+                    ++start;
+                }
+            }
+        }
+    }   // namespace debug
+
     /**
      * @brief Determines the Morton index vector to be received from processor p  (STEP 1)
-     *
-     * Determines the Morton index vector to be received from processor p? In addition, for each Morton index we
-     * store the cell, i.e. a pointer to its group and the index within the group (group_ptr, index). This will
-     * enable us to insert the multipoles received from processor p directly into the cell.
-     *
-     * leaf_to_receive_access: a vector of vector of pair (the iterator on the group ,the position of the cell in the group)
-     *  leaf_to_receive_access[p] is the position vector of cells in groups whose Morton index comes from processor p
-     *  leaf_to_receive_access[p][i] a pair (the iterator on the group ,the position of the cell in the group)
-     * vector of size nb_proc
-     *    - nb_messages_to_receive: the number of morton indices to  exchange with processor p
-     *    - nb_messages_to_send: the number of morton indices to  send tp processor p
-     *    - morton_to_receive: the morton indices to  exchange with processor p
-     *
-     * @tparam DistributionType
-     * @tparam IteratorType
-     * @tparam VectorOfVectorStructType
-     * @tparam VectorOfVectorType
+     *
+     * Determines the Morton index vector to be received from processor p. In addition, for each Morton index we
+     * store the cell, i.e. a pointer to its group and the index within the group (group_ptr, index). This will
+     * enable us to build the MPI data type that inserts the multipoles received from processor p directly into the cell.
+     *
+     * leaf_to_receive_access: a vector of vectors of pairs (the iterator on the group, the position of the cell in the group)
+     *  leaf_to_receive_access[p] is the position vector of the cells in groups whose Morton index comes from processor p
+     *  leaf_to_receive_access[p][i] is a pair (the iterator on the group, the position of the cell in the group)
+     * vectors of size nb_proc:
+     *    - nb_messages_to_receive: the number of Morton indices to exchange with processor p
+     *    - nb_messages_to_send: the number of Morton indices to send to processor p
+     *    - morton_to_receive: the Morton indices to receive from processor p
+     *
+     * @tparam distribution_type
+     * @tparam iterator_type
+     * @tparam vector_vector_struct_type
+     * @tparam vector_vector_type
      * @param[in] comm   the mpi communicator
      * @param[in] begin_left_ghost The iterator of the first ghost on the left
      * @param[in] end_left_ghost  The iterator of the last ghost on the left
      * @param[in] begin_right_ghost   The iterator of the first ghost on the right
      * @param[in] end_right_ghost The iterator of the last ghost on the right
-     * @param[in] distrib  the data distribution
+     * @param[in] distrib  the data distribution 
      * @param[out] nb_messages_to_receive the number of morton indices to exchange with processor p
      * @param[out] nb_messages_to_send the number of morton indices to send tp processor p
      * @param[out] leaf_to_receive_access For each component a direct access to it (iterator on group, position into the group)
      * @param[out] morton_to_receive for each process the vector of Morton indexes to receive
-     */
-    template<typename DistributionType, typename IteratorType, typename VectorOfVectorStructType,
-             typename VectorOfVectorType>
-    inline auto start_step1(comm_type& comm, IteratorType begin_left_ghost, IteratorType end_left_ghost,
-                            IteratorType begin_right_ghost, IteratorType end_right_ghost,
-                            DistributionType const& distrib, std::vector<int>& nb_messages_to_receive,
-                            std::vector<int>& nb_messages_to_send, VectorOfVectorStructType& leaf_to_receive_access,
-                            VectorOfVectorType& morton_to_receive) -> void
+     */
+    template<typename distribution_type, typename iterator_type, typename vector_vector_struct_type,
+             typename vector_vector_type>
+    inline void start_step1(comm_type& comm, iterator_type begin_left_ghost, iterator_type end_left_ghost,
+                            iterator_type begin_right_ghost, iterator_type end_right_ghost,
+                            distribution_type const& distrib, std::vector<int>& nb_messages_to_receive,
+                            std::vector<int>& nb_messages_to_send, vector_vector_struct_type& leaf_to_receive_access,
+                            vector_vector_type& morton_to_receive, bool verbose = false)
     {
         // We iterate on the ghosts
 
-        // function to fill the struture to_receive for groups between first_group_ghost and last_group_ghost
-        auto build_receive = [&nb_messages_to_receive, &leaf_to_receive_access, &distrib,
-                              &morton_to_receive](auto first_group_ghost, auto last_group_ghost)
+        // function to fill the structure to_receive with the cells of the groups between first_group_ghost and last_group_ghost
+        auto build_receive = [&nb_messages_to_receive, &leaf_to_receive_access, &distrib, &morton_to_receive,
+                              verbose](auto first_group_ghost, auto last_group_ghost)
         {
+            // if(verbose)
+            //     std::cout << "step1 build_receive " << std::distance(first_group_ghost, last_group_ghost) << std::endl
+            //               << std::flush;
             for(auto grp_ptr = first_group_ghost; grp_ptr != last_group_ghost; ++grp_ptr)
             {
                 int idx{0};
+                // if(verbose)
+                //     std::cout << "      idx=   " << idx << std::endl << std::flush;
                 // iterate on the cells
                 for(auto const& component: (*grp_ptr)->components())
                 {
@@ -73,94 +119,133 @@ namespace scalfmm::parallel::comm
                     ++nb_messages_to_receive[i];
                     leaf_to_receive_access[i].push_back(std::make_pair(grp_ptr, idx));
                     morton_to_receive[i].push_back(morton);
+                    // if(verbose)
+                    //     std::cout << "     step 1    " << idx << "  " << *grp_ptr << "  " << morton << "  proc " << i
+                    // << std::endl;
                     ++idx;
                 }
             }
         };
         // Start on the left ghosts
+        // if(verbose)
+        //     std::cout << "    Left \n" << std::flush;
         if(std::distance(begin_left_ghost, end_left_ghost) > 0)
         {
             build_receive(begin_left_ghost, end_left_ghost);
         }
         // Start on the ghosts on the right
+        // if(verbose)
+        //     std::cout << "    right \n" << std::flush;
+
         if(std::distance(begin_right_ghost, end_right_ghost) > 0)
         {
             build_receive(begin_right_ghost, end_right_ghost);
         }
-        //	io::print("step  nb_messages_to_receive[" + std::to_string(p) + "] ", nb_messages_to_receive.data[p]);
-        // Do we need to sort them ?
+        // else
+        // {
+        //     std::cout << "      No ghost group" << std::endl << std::flush;
+        // }
+        // Do we need to sort them?
         int p{0};
-        //	io::print("step 1  nb_messages_to_receive ", nb_messages_to_receive);
-        //	std::cout << "    morton_to_receive.size() " << morton_to_receive.size() <<std::endl;
-        //	for (auto & vec : morton_to_receive){
-        //
-        //	  auto last = std::unique(vec.begin(), vec.end());
-        //	  vec.erase(last, vec.end());
-        //	  io::print("step 1    morton_to_receive[" + std::to_string(p++) + "] ", vec);
-
-        //	}
-        /*
-	p = 0 ;
-	io::print("step 1  nb_messages_to_send ", nb_messages_to_send);
-	for (auto & vec : ){
-
-	  auto last = std::unique(vec.begin(), vec.end());
-	  vec.erase(last, vec.end());
-	  io::print("step 1    [" + std::to_string(p++) + "] ", vec);
-
-	}
-	*/
+        // if(verbose)
+        //     io::print("step 1  nb_messages_to_receive ", nb_messages_to_receive);
+        // if(verbose)
+        //     std::cout << " step1   morton_to_receive.size() " << morton_to_receive.size() << std::endl;
+        for(auto& vec: morton_to_receive)
+        {
+            //   auto last = std::unique(vec.begin(), vec.end());
+            //   vec.erase(last, vec.end());
+            if(verbose)
+                io::print("step 1    morton_to_receive[" + std::to_string(p++) + "] ", vec);
+        }
+
+        p = 0;
+        // if(verbose)
+        //     io::print("step 1  nb_messages_to_send ", nb_messages_to_send);
+        // if(verbose)
+        // {
+        //     for(auto& vec: morton_to_receive)
+        //     {
+        //         io::print("step 1    morton_to_receive[" + std::to_string(p++) + "] ", vec);
+        //     }
+        // }
+
         ////////////////////
         /// Exchange the morton indexes with processor p
         auto mpi_int_type = cpp_tools::parallel_manager::mpi::get_datatype<int>();
+        if(verbose)
+            io::print("step 1  nb_messages_to_receive ", nb_messages_to_receive);
+        // comm.barrier();
+        // std::clog << "comm.alltoall\n";
         comm.alltoall(nb_messages_to_receive.data(), 1, mpi_int_type, nb_messages_to_send.data(), 1, mpi_int_type);
+        // std::clog << "znd comm.alltoall\n";
+
+        if(verbose)
+            io::print("end step 1    nb_messages_to_send ", nb_messages_to_send);
     }
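+    // Illustrative usage sketch for STEP 1 (hedged: `tree`, `level`, `morton_type` and `group_iterator_type`
+    // are hypothetical names; only the start_step1 signature itself comes from this file):
+    //
+    //   const int nb_proc = comm.size();
+    //   std::vector<int> nb_messages_to_receive(nb_proc, 0), nb_messages_to_send(nb_proc, 0);
+    //   std::vector<std::vector<std::pair<group_iterator_type, int>>> leaf_to_receive_access(nb_proc);
+    //   std::vector<std::vector<morton_type>> morton_to_receive(nb_proc);
+    //   start_step1(comm, tree.begin_left_ghost(level), tree.end_left_ghost(level),
+    //               tree.begin_right_ghost(level), tree.end_right_ghost(level),
+    //               tree.cell_distribution(level), nb_messages_to_receive, nb_messages_to_send,
+    //               leaf_to_receive_access, morton_to_receive);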
 
     /**
      * @brief We can now exchange the morton indices (STEP 2)
-     *
+     * 
     *  Morton's list of indices to send their data (multipoles/particles) to proc p
-     * @tparam VectorOfVectorType
+     * @tparam vector_vector_type 
      * @param[in] nb_proc number of mpi processes
      * @param[in] rank the mpi rank
      * @param[in] comm the communicator
     * @param[in] nb_messages_to_receive for each process the number of messages to receive
     * @param[in] nb_messages_to_send  for each process the number of messages to send
     * @param[in] morton_to_receive for each process the vector of Morton indexes to receive
-     * @param[out]  for each process the vector of Morton indexes to send
+     * @param[out] morton_to_send for each process the vector of Morton indexes to send
     */
-    template<typename VectorOfVectorType>
+    template<typename vector_vector_type>
     inline void start_step2(int const& nb_proc, int const& rank, comm_type& comm,
                             std::vector<int>& nb_messages_to_receive, std::vector<int>& nb_messages_to_send,
-                            VectorOfVectorType& morton_to_receive, VectorOfVectorType&)
+                            vector_vector_type& morton_to_receive, vector_vector_type& morton_to_send,
+                            bool verbose = false)
     {
-        using mortonIdx_type = typename VectorOfVectorType::value_type::value_type;
+        using mortonIdx_type = typename vector_vector_type::value_type::value_type;
         std::vector<cpp_tools::parallel_manager::mpi::request> tab_mpi_status;
         //
         auto mpi_morton_type = cpp_tools::parallel_manager::mpi::get_datatype<mortonIdx_type>();
+        // io::print(" step2    nb_messages_to_send ", nb_messages_to_send);
         for(auto p = 0; p < nb_proc; ++p)
         {
-            if(p == rank)
-            {
-                continue;
-            }
-            // send the morton  indexes morton_to_receive
-            if(nb_messages_to_send[p] != 0)
+            // if(p == rank)
+            // {
+            //     continue;
+            // // }
+            // std::cout << "proc p " << p << "nb_messages_to_send " << nb_messages_to_send[p] << std::endl;
+            // std::cout << "proc p " << p << "nb_messages_to_send " << nb_messages_to_receive[p] << std::endl;
+
+            // send the morton indexes morton_to_receive
+            if(nb_messages_to_receive[p] != 0)
             {
-                [p].resize(nb_messages_to_send[p]);
+                // io::print(" send to " + std::to_string(p) + " morton  ", morton_to_receive[p]);
 
-                //		  std::cout << "step 2 me " << rank << " send to " << p << " nb morton= " << nb_messages_to_receive[p]
-                //                           << std::endl;
-                //                 io::print("step 2    morton_to_receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
+                // io::print("step 2    morton_to_receive[" + std::to_string(p) + "] ", morton_to_receive[p]);
 
                 comm.isend(morton_to_receive[p].data(), nb_messages_to_receive[p], mpi_morton_type, p, 600);
             }
-            if(nb_messages_to_receive[p] != 0)
+        }
+        // std::cout << std::endl << std::endl;
+        // std::cout << " step2   Start receive communications \n";
+
+        for(auto p = 0; p < nb_proc; ++p)
+        {
+            // std::cout << "proc p " << p << " nb_messages_to_receive " << nb_messages_to_receive[p] << std::endl;
+            // std::cout << "proc p " << p << " nb_messages_to_send " << nb_messages_to_send[p] << std::endl;
+            morton_to_send[p].resize(nb_messages_to_send[p], -1);
+            // io::print("start_step2 init morton_to_send ", morton_to_send[p]);
+            // Get the morton index to send
+            if(nb_messages_to_send[p] != 0)
             {
-                //                std::cout << "step 2 me " << rank << " receive to " << p << " size= " << nb_messages_to_send[p]
-                //                           << std::endl;
+                // std::cout << "step 2 me " << rank << " receive to " << p
+                //           << " nb_messages_to_receive= " << nb_messages_to_receive[p]
+                //           << " nb_messages_to_send= " << nb_messages_to_send[p] << std::endl
+                //           << std::flush;
 
-                tab_mpi_status.push_back(comm.irecv([p].data(), nb_messages_to_send[p], mpi_morton_type, p, 600));
+                tab_mpi_status.push_back(
+                  comm.irecv(morton_to_send[p].data(), nb_messages_to_send[p], mpi_morton_type, p, 600));
             }
         }
         if(tab_mpi_status.size() > 0)
@@ -168,36 +253,37 @@ namespace scalfmm::parallel::comm
             cpp_tools::parallel_manager::mpi::request::waitall(tab_mpi_status.size(), tab_mpi_status.data());
         }
         // // check
-        /*
-         for(auto p = 0; p < nb_proc; ++p)
-         {
-             io::print("step 2    [" + std::to_string(p) + "] ", morton_to_send[p]);
-         }
-	*/
+        // for(auto p = 0; p < nb_proc; ++p)
+        // {
+        //     io::print("step 2    morton_to_send[" + std::to_string(p) + "] ", morton_to_send[p]);
+        // }
     }
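+    // Illustrative continuation of the sketch given after start_step1 (hedged; the containers are the
+    // ones filled during STEP 1):
+    //
+    //   std::vector<std::vector<morton_type>> morton_to_send(nb_proc);
+    //   start_step2(nb_proc, comm.rank(), comm, nb_messages_to_receive, nb_messages_to_send,
+    //               morton_to_receive, morton_to_send);
+    //   // After the call, morton_to_send[p] holds the Morton indices whose data this process must send to p.
+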
-
     /**
-     * @brief For the vector of Morton indices to be sent to processor p, we construct a direct access to the component
-     *
-     * @tparam IteratorType
-     * @tparam VectorOfVectorStructType
-     * @tparam VectorOfVectorType
+     * @brief For the vector of Morton indices to be sent to processor p, we construct a direct access to the component (cell or leaf)
+     * 
+     * @tparam iterator_type 
+     * @tparam vector_vector_struct_type 
+     * @tparam vector_vector_type 
      * @param nb_proc the number of processors
      * @param begin_grp the first iterator on the group
      * @param end_grp the last iterator on the group
      * @param component_access the access to the component (iterator on group, position into the group)
-     * @param  for each processor the vector of Morton indexes to send
+     * @param morton_to_send for each processor the vector of Morton indexes to send
      */
-    template<typename IteratorType, typename VectorOfVectorStructType, typename VectorOfVectorType>
-    auto build_direct_access_to_leaf(const int nb_proc, IteratorType begin_grp, IteratorType end_grp,
-                                     VectorOfVectorStructType& component_access, VectorOfVectorType const&) -> void
+    template<typename iterator_type, typename vector_vector_struct_type, typename vector_vector_type>
+    auto build_direct_access_to_components(const int nb_proc, iterator_type begin_grp, iterator_type end_grp,
+                                           vector_vector_struct_type& component_access,
+                                           vector_vector_type const& morton_to_send) -> void
     {
-        using access_type = typename VectorOfVectorStructType::value_type;
-        using vector_morton_type = typename VectorOfVectorType::value_type;
-        auto build_index_grp = [](auto begin_grp, auto end_grp, vector_morton_type const& _p, access_type& to_send_p)
+        using access_type = typename vector_vector_struct_type::value_type;
+        using vector_morton_type = typename vector_vector_type::value_type;
+
+        bool verbose{false};
+        auto build_index_grp =
+          [&verbose](auto begin_grp, auto end_grp, vector_morton_type const& morton_to_send_p, access_type& to_send_p)
         {
             int idx{0};
-            int max_idx = _p.size();
+            int max_idx = morton_to_send_p.size();
             to_send_p.resize(max_idx);
             // loop on the groups
             // auto it = std::begin(buffer);
@@ -207,40 +293,71 @@ namespace scalfmm::parallel::comm
                 int start_grp{0};
                 auto const& csymb = (*grp_ptr)->csymbolics();
                 // iterate on the cells
-                while(idx < max_idx and math::between(_p[idx], csymb.starting_index, csymb.ending_index))
+                // if(verbose)
+                //   std::clog << idx << " morton "<< morton_to_send_p[idx] << " in [ "<<csymb.starting_index<< ", " << csymb.ending_index << "[\n";
+                while(idx < max_idx and math::between(morton_to_send_p[idx], csymb.starting_index, csymb.ending_index))
                 {   // find cell inside the group
                     int pos{-1};
                     for(int i = start_grp; i < (*grp_ptr)->size(); ++i)
                     {
                         auto morton = (*grp_ptr)->component(i).csymbolics().morton_index;
-                        if(_p[idx] == morton)
+                        if(morton_to_send_p[idx] == morton)
                         {
                             pos = i;
                             start_grp = i + 1;
                             to_send_p[idx].first = grp_ptr;
                             to_send_p[idx].second = i;
+                            // if (verbose)
+                            //   std::clog << "  m= "<<morton << "  ptr " <<  to_send_p[idx].first->get() << " pos " << i << std::endl;
                             break;
                         }
                     }
                     ++idx;
                 }
             }
+            if(idx != max_idx)
+            {
+                std::cerr << "Didn't find the expected number of Morton indexes\n";
+                // std::string outName2("bug_direct_access_to_component_rank_" + std::to_string(rank) + ".txt");
+                // std::ofstream out(outName2);
+                // scalfmm::io::trace(out, letGroupTree, 2);
+                // out << "\n" << " \n";
+                io::print(std::cerr, "morton_to_send: ", morton_to_send_p);
+                std::cerr << "\n missing morton: ";
+                // out << "missing morton: ";
+
+                for(int i = idx; i < max_idx; ++i)
+                {
+                    std::cerr << morton_to_send_p[i] << " ";
+                    // out << morton_to_send_p[i] << " ";
+                }
+                std::cerr << "\n";
+                // out.close();
+                throw std::runtime_error("Missing Morton index in building direct component access");
+            }
         };
 
         for(auto p = 0; p < nb_proc; ++p)
         {
-            // io::print("    [" + std::to_string(p) + "] ", morton_to_send[p]);
-
-            if([p].size() != 0)
+            // io::print(std::clog, "    morton_to_send[" + std::to_string(p) + "] ", morton_to_send[p]);
+            if(morton_to_send[p].size() != 0)
             {
-                build_index_grp(begin_grp, end_grp, [p], component_access[p]);
-                auto const& elt = component_access[p];
-                // for(auto i = 0; i < elt.size(); ++i)
+                // verbose = p == 3 ? true : false;
+                build_index_grp(begin_grp, end_grp, morton_to_send[p], component_access[p]);
+                // if(p == 3)
                 // {
-                //     std::cout << "     " << p << " "
-                //               << (*(elt[i].first))->component(elt[i].second).csymbolics().morton_index << " "
-                //               << elt[i].second << " " << [p][i] << " nb part "
-                //               << (*(elt[i].first))->component(elt[i].second).size() << std::endl;
+                //     auto const& elt = component_access[p];
+                //     for(auto i = 0; i < elt.size(); ++i)
+                //     {
+                //         std::clog
+                //           << "   -->p=" << p << " ptr="
+                //           << elt[i].first->get()
+                //           //	  << " m=" << (*(elt[i].first))->component(elt[i].second).csymbolics().morton_index
+                //           << "  pos=" << elt[i].second << " m="
+                //           << morton_to_send[p][i]
+                //           //		  << " nb part "   << (*(elt[i].first))->component(elt[i].second).size()
+                //           << std::endl;
+                //     }
                 // }
             }
         }
@@ -248,22 +365,19 @@ namespace scalfmm::parallel::comm
 
     /**
      * @brief Construct the MPI type of the particle according to leaf_to_access
-     *
-     * @tparam dimension
-     * @tparam VectorOfVectorStructType
+     * 
+     * @tparam dimension 
+     * @tparam vector_vector_struct_type 
      * @param leaf_to_access  For each processor the leaf to access (for receiving or sending)
-     * @param nb_inputs
      * @param mpi_position_type  the MPI type of the coordinate of the points of the particles
      * @param mpi_input_type   the MPI type of the inputs of the particles
-     * @return std::vector<MPI_Datatype>
+     * @return std::vector<MPI_Datatype> 
      */
-    template<std::size_t Dimension, typename VectorOfVectorStructType>
-    inline auto build_mpi_particles_type(VectorOfVectorStructType const& leaf_to_access, int const nb_inputs,
+    template<std::size_t dimension, typename vector_vector_struct_type>
+    auto inline build_mpi_particles_type(vector_vector_struct_type const& leaf_to_access, int const nb_inputs,
                                          MPI_Datatype mpi_position_type,
                                          MPI_Datatype mpi_input_type) -> std::vector<MPI_Datatype>
     {
-        static constexpr std::size_t dimension = Dimension;
-
         const int nb_proc{int(leaf_to_access.size())};
         std::vector<MPI_Datatype> newtype(nb_proc);
 
@@ -271,6 +385,7 @@ namespace scalfmm::parallel::comm
         {
             if(leaf_to_access[p].size() != 0)
             {
+                // leaf_to_access[p] = std::vector<pair> [i] = (group_ptr, index_in_group)
                 auto const& elt = leaf_to_access[p];
                 int nb_mpi_types{int(elt.size() * (dimension + nb_inputs))};
                 std::vector<int> length(nb_mpi_types, 1);
@@ -308,11 +423,147 @@ namespace scalfmm::parallel::comm
                         type[i + stride * nb_elt] = type[i + dimension * nb_elt];
                         MPI_Get_address(&(leaf[0].inputs(k)), &disp[i + stride * nb_elt]);
                     }
+                    // std::cout << p << " " << leaf.csymbolics().morton_index << " nb part " << leaf.size() << " *ptr_x "
+                    //           << proxy_position << " snd part " << *(ptr_x + 1) << " inputs0: " << leaf[0].inputs()[0]
+                    //           << " inputs1: " << *(&(leaf[0].inputs()[0]) + 1) << "  ptr " << *(ptr_inputs_0 + 1)
+                    //           << std::endl;
                 }   // end loop on leaf_view
+                // std::cout << " create type " << std::endl;
+                // io::print("  " + std::to_string(p) + " disp", disp);
                 MPI_Type_create_struct(nb_mpi_types, length.data(), disp.data(), type.data(), &newtype[p]);
                 MPI_Type_commit(&newtype[p]);
+                // std::cout << "  send to " << p << " size " << size_msg << std::endl;
             }
         }
         return newtype;
     }
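+    // Hedged usage sketch: `value_type`, `leaf_to_access` and `nb_inputs` are assumptions, the rest comes
+    // from the signature above. Since the displacements are absolute addresses (MPI_Get_address), the
+    // committed type is meant to be communicated with MPI_BOTTOM as the buffer:
+    //
+    //   auto mpi_position_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+    //   auto mpi_input_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+    //   auto types = build_mpi_particles_type<3>(leaf_to_access, nb_inputs, mpi_position_type, mpi_input_type);
+    //   // for each p with a non-empty leaf_to_access[p]:
+    //   //     MPI_Isend(MPI_BOTTOM, 1, types[p], p, tag, mpi_comm, &request);
+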
+    /// @brief Construct the MPI type of all multipoles to send to a different process
+    /// @tparam vector_vector_struct_type
+    /// @param cell_to_access for each processor, the direct access (group iterator, position in the group) to the cells
+    /// @param nb_inputs the number of multipoles stored per cell
+    /// @param mpi_multipole_type the MPI type of one multipole coefficient
+    /// @return A vector of MPI types, one per processor
+    template<typename vector_vector_struct_type>
+    auto inline build_mpi_multipoles_type(vector_vector_struct_type const& cell_to_access, int const nb_inputs,
+                                          MPI_Datatype& mpi_multipole_type) -> std::vector<MPI_Datatype>
+    {
+        // std::cout << " build_mpi_multipoles_type inside nb_inputs" << nb_inputs << std::endl << std::flush;
+
+        const int nb_proc{int(cell_to_access.size())};
+        std::vector<MPI_Datatype> newtype(nb_proc, MPI_DATATYPE_NULL);
+
+        for(auto p = 0; p < nb_proc; ++p)
+        {
+            // std::clog << "   multipole type(p=" << p << ") nb cells to pack " << cell_to_access[p].size() << "\n";
+            // std::clog << std::flush;
+            if(cell_to_access[p].size() != 0)
+            {
+                auto const& elt = cell_to_access[p];
+                // number of mpi type to construct (=cells number * nb_inputs)
+                int nb_mpi_types{int(elt.size() * nb_inputs)};
+                //
+                std::vector<int> length(nb_mpi_types, 1);
+                std::vector<MPI_Aint> disp(nb_mpi_types);
+                std::vector<MPI_Datatype> type(nb_mpi_types);
+                // if(p == 3)
+                // {
+                //     //bug
+                //     for(auto i = 0; i < elt.size(); ++i)
+                //     {
+                //         std::clog << " ptr: " << *(elt[i].first) << " pos " << elt[i].second << std::endl;
+                //     }
+                // std::clog << "------\n";
+                // }
+                int size_msg{0};
+                for(auto i = 0; i < elt.size(); ++i)
+                {
+                    // *(elt[i].first) = ptr_group
+                    // elt[i].second = index inside group
+                    int jump{0};
+                    auto const& cell = (*(elt[i].first))->component(elt[i].second);
+                    // tuple of iterators
+                    // cell[0] return a particle proxy on the first particle
+                    auto const& m = cell.transfer_multipoles();
+                    auto nb_m = m.size();   // get number of multipoles = nb_inputs
+                    // std::cout << "          nb_m" << m.size() << std::endl << std::flush;
+                    for(std::size_t k{0}; k < nb_m; ++k)
+                    {
+                        auto const& ten = m.at(k);
+                        // std::cout << "     size " << int(ten.size()) << std::endl << std::flush;
+                        MPI_Type_contiguous(int(ten.size()), mpi_multipole_type, &type[i * nb_inputs + k]);
+                        //                    MPI_Get_address(&(ten.data()[0]), &disp[i * nb_inputs + k]);
+                        MPI_Get_address(ten.data(), &disp[i * nb_inputs + k]);
+                        // std::cout << "     i * nb_inputs + k " << i * nb_inputs + k << " nb_mpi_types " << nb_mpi_types
+                        //           << std::endl
+                        //           << std::flush;
+                    }
+                }   // end loop on cell_view
+                //	    io::print(std::clog, "m2l(type) disp: ", disp); std::clog << std::flush;
+                //	    io::print(std::clog, "m2l(type) length: ", length); std::clog << std::flush;
+
+                MPI_Type_create_struct(nb_mpi_types, length.data(), disp.data(), type.data(), &newtype[p]);
+                MPI_Type_commit(&newtype[p]);
+                //	    std::clog << std::flush;
+            }
+        }
+        return newtype;
+    }
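+    // Hedged usage sketch (assumes cell_to_send_access was built with build_direct_access_to_components
+    // and that the multipole coefficients have type value_type). As above, the displacements are absolute
+    // addresses, so the committed type is meant to be communicated with MPI_BOTTOM as the buffer:
+    //
+    //   auto mpi_multipole_type = cpp_tools::parallel_manager::mpi::get_datatype<value_type>();
+    //   auto types = build_mpi_multipoles_type(cell_to_send_access, nb_inputs, mpi_multipole_type);
+    //   // for each p with types[p] != MPI_DATATYPE_NULL:
+    //   //     MPI_Isend(MPI_BOTTOM, 1, types[p], p, tag, mpi_comm, &request);
+
+    /// @brief Variant of build_mpi_multipoles_type that registers each multipole tensor through a
+    ///        length/displacement entry instead of an intermediate MPI_Type_contiguous.
+    /// @tparam vector_vector_struct_type
+    /// @param cell_to_access for each processor, the direct access (group iterator, position in the group) to the cells
+    /// @param nb_inputs the number of multipoles stored per cell
+    /// @param mpi_multipole_type the MPI type of one multipole coefficient
+    /// @return A vector of MPI types, one per processor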
+    template<typename vector_vector_struct_type>
+    auto inline build_mpi_multipoles_type2(vector_vector_struct_type const& cell_to_access, int const nb_inputs,
+                                           MPI_Datatype mpi_multipole_type) -> std::vector<MPI_Datatype>
+    {
+        const int nb_proc{int(cell_to_access.size())};
+        std::vector<MPI_Datatype> newtype(nb_proc, MPI_DATATYPE_NULL);
+
+        for(auto p = 0; p < nb_proc; ++p)
+        {
+            if(cell_to_access[p].size() != 0)
+            {
+                auto const& elt = cell_to_access[p];
+                // number of mpi type to construct (=cells number * nb_inputs)
+                int nb_mpi_types{int(elt.size() * nb_inputs)};
+                //
+                std::vector<int> length(nb_mpi_types);
+                std::vector<MPI_Aint> disp(nb_mpi_types);
+                std::vector<MPI_Datatype> type(nb_mpi_types, mpi_multipole_type);
+
+                int size_msg{0};
+                for(auto i = 0; i < elt.size(); ++i)
+                {
+                    // *(elt[i].first) = ptr_group
+                    // elt[i].second = index inside group
+                    int jump{0};
+                    auto const& cell = (*(elt[i].first))->component(elt[i].second);
+                    //
+                    auto const& m = cell.transfer_multipoles();
+                    auto nb_m = m.size();   // get number of multipoles = nb_inputs
+                    //			    std::cout << "          nb_m" <<  m.size() <<std::endl;
+                    for(std::size_t k{0}; k < nb_m; ++k)
+                    {
+                        auto const& ten = m.at(k);
+                        // std::cout << " size " << int(ten.size()) << std::endl;
+                        // MPI_Type_contiguous(int(ten.size()), mpi_multipole_type, &type[i * nb_inputs + k]);
+                        length[i * nb_inputs + k] = ten.size();
+                        MPI_Get_address(ten.data(), &disp[i * nb_inputs + k]);   // one displacement per multipole tensor
+                    }
+                    // std::cout << p << " " << cell.csymbolics().morton_index << " nb part " << cell.size() << " *ptr_x "
+                    //           << proxy_position << " snd part " << *(ptr_x + 1) << " inputs0: " << cell[0].inputs()[0]
+                    //           << " inputs1: " << *(&(cell[0].inputs()[0]) + 1) << "  ptr " << *(ptr_inputs_0 + 1)
+                    //           << std::endl;
+                }   // end loop on cell_view
+                // std::cout << " create type " << std::endl;
+                // io::print("  " + std::to_string(p) + " disp", disp);
+                MPI_Type_create_struct(nb_mpi_types, length.data(), disp.data(), type.data(), &newtype[p]);
+                MPI_Type_commit(&newtype[p]);
+                // std::cout << "  send to " << p << " size " << size_msg << std::endl;
+            }
+        }
+        return newtype;
+    }
+
+    template<typename Tree>
+    inline void prepare_comm_up(Tree& /*tree*/)
+    {
+        // TODO: prepare the upward communications (not implemented yet).
+    }
+
 }   // namespace scalfmm::parallel::comm
diff --git a/include/scalfmm/parallel/mpi/utils.hpp b/include/scalfmm/parallel/mpi/utils.hpp
index caf7833ffe8f46aba293b4265c7794c74f18514a..81a810f655883ab31a63dcd9e641d823c490dfce 100644
--- a/include/scalfmm/parallel/mpi/utils.hpp
+++ b/include/scalfmm/parallel/mpi/utils.hpp
@@ -1,10 +1,11 @@
-// --------------------------------
-// See LICENCE file at project root
-// File : scalfmm/parallel/mpi/utils.hpp
-// --------------------------------
 #ifndef _PARALLEL_MPI_UTILS_HPP_
 #define _PARALLEL_MPI_UTILS_HPP_
 
+#include <algorithm>
+#include <fstream>
+#include <iostream>
+#include <vector>
+
 #include <cpp_tools/colors/colorized.hpp>
 #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 
@@ -15,25 +16,19 @@
 #include <inria/algorithm/distributed/mpi.hpp>
 #include <inria/linear_tree/balance_tree.hpp>
 #endif
-
-#include <algorithm>
-#include <fstream>
-#include <iostream>
-#include <vector>
-
 namespace scalfmm::parallel::utils
 {
 
     /**
      * @brief  print the distribution of components (cells/leaves) in a stream
      *
-     * @tparam VectorType
+     * @tparam Vector
      * @param out the stream
      * @param header the header to write
      * @param distrib
      */
-    template<typename VectorType>
-    inline auto print_distrib(std::ostream& out, std::string const& header, VectorType const& distrib) -> void
+    template<typename Vector>
+    auto inline print_distrib(std::ostream& out, std::string const& header, Vector const& distrib) -> void
     {
         out << header;
         for(auto p: distrib)
@@ -42,22 +37,20 @@ namespace scalfmm::parallel::utils
         }
         out << std::endl;
     }
-
     /**
      * @brief print the distribution of components (cells/leaves)
      *
-     * @tparam VectorType
+     * @tparam Vector
      * @param header the header to write
      * @param rank  the process id
      * @param distrib  the vector of distribution
      */
-    template<typename VectorType>
-    inline auto print_distrib(std::string const& header, int rank, VectorType const& distrib) -> void
+    template<typename Vector>
+    auto inline print_distrib(std::string const& header, int rank, Vector const& distrib) -> void
     {
         std::string new_header("rank(" + std::to_string(rank) + ") " + header);
         print_distrib(std::cout, new_header, distrib);
     }
-
     /**
      * @brief construct the morton indexes at the parent level
      *
@@ -69,31 +62,30 @@ namespace scalfmm::parallel::utils
      * @tparam VectorMortonIdx  the type of the vector of Morton index
      * @param[inout] leafMortonIdx  the vector of Morton index
      */
-    template<int Dimension, typename VectorMortonIdx>
-    inline auto move_index_to_upper_level(VectorMortonIdx& leafMortonIdx) -> void
+    template<int dimension, typename VectorMortonIdx>
+    auto inline move_index_to_upper_level(VectorMortonIdx& leafMortonIdx) -> void
     {   // Move leafMortonIdx to level level_shared
         for(auto& p: leafMortonIdx)
         {
-            p = p >> Dimension;
+            p = p >> dimension;
         }
         auto last = std::unique(leafMortonIdx.begin(), leafMortonIdx.end());
         leafMortonIdx.erase(last, leafMortonIdx.end());
     }
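+    // Worked example (illustrative): with dimension = 3, a right shift by 3 bits maps each child Morton
+    // index to its parent index, and duplicates are then removed:
+    //
+    //   std::vector<std::int64_t> idx{8, 9, 15, 16};
+    //   move_index_to_upper_level<3>(idx);   // idx is now {1, 2}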
 
-    /**
-     * @brief send_get_min_morton_idx send Morton index to the left and get value from the right
-     *
-     * @tparam IndexType
-     * @param para
-     * @param[in] morton_idx the Morton index to send o send to processor p-1
-     * @return the Morton index coming from the right
-     */
-    template<typename IndexType>
-    [[nodiscard]] IndexType send_get_min_morton_idx(cpp_tools::parallel_manager::parallel_manager& para,
-                                                    IndexType& morton_idx)
+    ///
+    /// \brief send_get_min_morton_idx sends the Morton index to the left neighbour and gets the value from the right one
+    ///
+    /// \param[in] para the parallel manager
+    /// \param[in] morton_idx the Morton index to send to processor p-1
+    /// \return the Morton index coming from the right (processor p+1)
+    ///
+    template<typename index_type>
+    [[nodiscard]] index_type send_get_min_morton_idx(cpp_tools::parallel_manager::parallel_manager& para,
+                                                     index_type& morton_idx)
     {
         // Setting parameter
-        IndexType buff_recev{0};
+        index_type buff_recev{0};
 #ifdef SCALFMM_USE_MPI
         auto comm = para.get_communicator();
         int nb_proc = comm.size();
@@ -102,7 +94,7 @@ namespace scalfmm::parallel::utils
         if(nb_proc != 1)
         {
             cpp_tools::parallel_manager::mpi::request tab_mpi_status;
-            auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<IndexType>();
+            auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<index_type>();
 
             const int sender = (my_rank + 1 == nb_proc) ? MPI_PROC_NULL : my_rank + 1;
             const int receiver = (my_rank == 0) ? MPI_PROC_NULL : my_rank - 1;
@@ -122,24 +114,24 @@ namespace scalfmm::parallel::utils
     }
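+    // Hedged usage sketch: each process sends its own smallest Morton index to its left neighbour and
+    // receives the smallest index of its right neighbour (leafMortonIdx is assumed sorted and non-empty):
+    //
+    //   auto morton_min = leafMortonIdx.front();
+    //   auto right_min = send_get_min_morton_idx(para, morton_min);
+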
 #ifdef SCALFMM_USE_MPI
 
-    /**
-     * @brief exchange_data_left_right to exchange data left and right between processor left and right
-     *
-     * The processor p send data_left to processor p-1 and receive from it data_right and
-     *  p send data_right to processor p+1 and receive from it data_left
-     *
-     * @tparam DataType
-     * @param[in] conf
-     * @param[in] data_left data to send to processor left
-     * @param[in] data_right  data to send to processor right
-     * @return a tuple containing the value_right of processor on the left and the value left coming from processor right
-     */
-    template<typename DataType>
-    auto exchange_data_left_right(cpp_tools::parallel_manager::mpi_config& conf, DataType& data_left,
-                                  DataType& data_right)
+    ///
+    /// \brief exchange_data_left_right exchanges data with the left and right neighbouring processors
+    ///
+    /// Processor p sends data_left to processor p-1 and receives data_right from it;
+    /// p sends data_right to processor p+1 and receives data_left from it.
+    /// \param[in] conf the MPI configuration
+    /// \param[in] data_left the data to send to the processor on the left
+    /// \param[in] data_right the data to send to the processor on the right
+    ///
+    /// \return a tuple containing the data_right value of the processor on the left and the
+    ///   data_left value coming from the processor on the right
+    ///
+    template<typename data_type>
+    auto exchange_data_left_right(cpp_tools::parallel_manager::mpi_config& conf, data_type& data_left,
+                                  data_type& data_right)
     {
         // Setting parameter
-        DataType buff_p{0}, buff_n{0};
+        data_type buff_p{0}, buff_n{0};
         auto comm = conf.comm;
         int nb_proc = comm.size();
         int my_rank = comm.rank();
@@ -148,7 +140,7 @@ namespace scalfmm::parallel::utils
         {
             // First exchange to the left
             cpp_tools::parallel_manager::mpi::request tab_mpi_status[2];
-            //  auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<IndexType>();
+            //  auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<index_type>();
             // if i'm not the last proc
             const int right = (my_rank + 1 == nb_proc) ? MPI_PROC_NULL : my_rank + 1;
             const int left = (my_rank == 0) ? MPI_PROC_NULL : my_rank - 1;
@@ -170,28 +162,33 @@ namespace scalfmm::parallel::utils
     }
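+    // Hedged usage sketch (this mirrors the call made in balanced_particles below):
+    //
+    //   MortonDistrib_type weight_prev, weight_next;
+    //   std::tie(weight_prev, weight_next) = exchange_data_left_right(conf, minIndex, maxIndex);
+    //   // weight_prev is the "right" value of the left neighbour, weight_next the "left" value of the
+    //   // right neighbour.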
 #endif
 
-    /**
-     * @brief Distribute uniformly on the processes the leaves.
-     *
-     *  Split in interval (semi-open) the leaves
-     * The algorithm is
-     *  1) we distribute the particles according to their Morton index. The
-     *  the leaves are split on the processor by their Morton index
-     *
-     *  2) balanced the Morton index by some criteria to define
-     *
-     * @tparam MortonArrayType
-     * @param manager
-     * @param mortonArray
-     * @return auto
-     */
-    template<typename MortonArrayType>
-    auto balanced_leaves(cpp_tools::parallel_manager::parallel_manager& manager, MortonArrayType& mortonArray)
+    ///
+    /// \brief Distribute the leaves uniformly over the processes.
+    ///
+    /// The leaves are split into semi-open intervals of Morton indices. The algorithm is:
+    ///  1) the particles are distributed according to their Morton index, so that the leaves
+    ///     are split over the processes by their Morton index;
+    ///  2) the Morton indices are then balanced according to a criterion to be defined.
+    ///
+    /// \param[in] manager the parallel manager
+    /// \param[inout] mortonArray the Morton indices located on the processor
+    /// \return the distribution of the leaves (one Morton interval per process)
+
+    template<typename MortonArray_type>
+    auto balanced_leaves(cpp_tools::parallel_manager::parallel_manager& manager, MortonArray_type& mortonArray)
     {
-        using morton_type = typename MortonArrayType::value_type;
+        //         std::cout << cpp_tools::colors::green << " --> Begin distrib::balanced_leaves  " << cpp_tools::colors::reset
+        //                   << std::endl;
+        //
+        using morton_type = typename MortonArray_type::value_type;
 
         auto rank = manager.get_communicator().rank();
-
+        // io::print("rank(" + std::to_string(rank) + ") leafMortonIdx: ", mortonArray);
+        // auto last = std::unique(mortonArray.begin(), mortonArray.end());
+        // mortonArray.erase(last, mortonArray.end());
+        //  io::print("rank(" + std::to_string(rank) + ") leafMortonIdx U: ", mortonArray);
+        //
         // get max and min of the Morton index owned by current  process
         // [min, max] On two consecutive processes we may have max[p] = min[p+1]
         // we remove such case
@@ -215,7 +212,7 @@ namespace scalfmm::parallel::utils
         ///  Construct a  uniform distribution of the Morton index
         ///
 
-        MortonArrayType morton_distrib;
+        MortonArray_type morton_distrib;
         try
         {
             inria::mpi_config conf_tmp(manager.get_communicator().raw_comm);
@@ -226,40 +223,54 @@ namespace scalfmm::parallel::utils
         {
             std::cerr << e.what() << '\n';
         }
+        //    print("rank(" + std::to_string(rank) + "morton_distrib ", morton_distrib);
+        //    manager.comm.barrier();
+
+        //            std::cout << "rank(" + std::to_string(rank) + ") Morton distrib  [" << morton_distrib[0] << ",
+        //            "
+        //                      << morton_distrib[morton_distrib.size() - 1] << "]\n";
+
+        //print("rank(" + std::to_string(rank) + ") Distrib cells Index: ", morton_distrib);
 
         cell_distrib.resize(manager.get_num_processes(), {0, 0});
         std::array<morton_type, 2> local{morton_distrib[0], morton_distrib[morton_distrib.size() - 1]};
         cell_distrib[0] = local;
+        //      print("rank(" + std::to_string(rank) + ") local: ", local);
         /// share the distribution on all processors
         manager.get_communicator().allgather(local.data(), sizeof(local), MPI_CHAR, cell_distrib.data(), sizeof(local),
                                              MPI_CHAR /*, 0*/);
 
 #endif
+        //        std::cout << cpp_tools::colors::red;
+        //         io::print("rank(" + std::to_string(rank) + ") cell_distrib: ", cell_distrib);
+
+        //         std::cout << cpp_tools::colors::green << " --> End distrib::balanced_leaves  " << cpp_tools::colors::reset
+        //                   << std::endl;
         return cell_distrib;
     }
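+    // Hedged usage sketch (leafMortonIdx denotes the locally owned, sorted leaf Morton indices):
+    //
+    //   auto cell_distrib = balanced_leaves(manager, leafMortonIdx);
+    //   // cell_distrib[p] is the {first, last} Morton index interval owned by process p.
+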
-
-    /**
-     * @brief balanced_particles compute a balanced particle distribution
-     *
-     *  1) we distribute the particles according to their Morton index. The
-     *  the leaves
-     *   are split on the processor by their Morton index
-     *  2) balanced the Morton index by some criteria to define
-     *
-     * @tparam ParticleArrayType
-     * @tparam MortonArrayType
-     * @param manager the parallel manager.
-     * @param partArray the vector of particles own by the processor.
-     * @param morton_array the morton index located on the processor.
-     * @param number_of_particles the total number of particles on all processes.
-     * @return the distribution in terms of Morton index.
-     */
-    template<typename ParticleArrayType, typename MortonArrayType>
-    auto balanced_particles(cpp_tools::parallel_manager::parallel_manager& manager, ParticleArrayType& partArray,
-                            const MortonArrayType& morton_array, const std::size_t& number_of_particles)
+    ///
+    /// \brief balanced_particles computes a balanced particle distribution
+    ///
+    ///  1) the particles are distributed according to their Morton index, so that the leaves
+    ///     are split over the processes by their Morton index;
+    ///  2) the Morton indices are then balanced according to a criterion to be defined.
+    ///
+    /// \param[in] manager the parallel manager
+    /// \param[in] partArray the vector of particles owned by the processor
+    /// \param[in] morton_array the Morton index of each particle located on the processor
+    /// \param[in] number_of_particles the total number of particles on all processes
+    ///
+    /// \return the distribution in terms of Morton index (std::vector<std::array<Morton_type, 2>>)
+    template<typename ParticleArray_type, typename MortonArray_type>
+    auto balanced_particles(cpp_tools::parallel_manager::parallel_manager& manager, ParticleArray_type& partArray,
+                            const MortonArray_type& morton_array, const std::size_t& number_of_particles)
     {
-        using Morton_type = typename MortonArrayType::value_type;
-        using MortonDistribType = std::array<int, 2>;
+        std::cout << cpp_tools::colors::green << " --> Begin distrib::balanced_particles  " << cpp_tools::colors::reset
+                  << std::endl;
+
+        using Morton_type = typename MortonArray_type::value_type;
+        using MortonDistrib_type = std::array<int, 2>;
 
         auto rank = manager.get_process_id();
         auto nb_proc = manager.get_num_processes();
@@ -270,21 +281,31 @@ namespace scalfmm::parallel::utils
         auto LeafMortonIndex(morton_array);
         auto last = std::unique(LeafMortonIndex.begin(), LeafMortonIndex.end());
         LeafMortonIndex.erase(last, LeafMortonIndex.end());
+        io::print(" LeafMortonIndex ", LeafMortonIndex);
         /// LeafMortonIndex has the size of the number of leaves
         /// weight = ({Morton index, number of particles}) for each leaf
-        std::vector<MortonDistribType> weight(LeafMortonIndex.size(), {bad_index, 0});
+        std::vector<MortonDistrib_type> weight(LeafMortonIndex.size(), {bad_index, 0});
         std::size_t pos = 0;
         weight[pos][0] = LeafMortonIndex[pos];
+        // std::cout << cpp_tools::colors::red << "leaf size: " << LeafMortonIndex.size() << std::endl;
         {   // loop on the number of particles
             for(std::size_t part = 0; part < morton_array.size(); ++part)
             {
+                //                    std::cout << "part " << part << " " <<
+                //                    tmp[part] << " pos " << pos << " " <<
+                //                    leafMortonIdx[pos]
+                //                              << "  " << weight[pos] <<
+                //                              std::endl;
                 while(morton_array[part] != LeafMortonIndex[pos])
                 {
+                    //                       std::cout << "  new pos " << pos <<
+                    //                       std::endl;
                     pos++;
                 }
                 weight[pos][1] += 1;
                 weight[pos][0] = LeafMortonIndex[pos];
             }
+            io::print("rank(" + std::to_string(rank) + ") weight: ", weight);
         }
 
         // get max and min of the Morton index owned by current process
@@ -297,14 +318,17 @@ namespace scalfmm::parallel::utils
         if(nb_proc == 1)
         {
             morton_distrib[0] = {minIndex[0], maxIndex[0]};
-            //     return morton_distrib;
+            // parallel::utils::print_distrib(std::cout, "distrib proc=1", morton_distrib);
+
+            return morton_distrib;
         }
+        io::print("rank(" + std::to_string(rank) + ") weight initial: ", weight);
 
 #ifdef SCALFMM_USE_MPI
 
         cpp_tools::parallel_manager::mpi_config conf(manager.get_communicator());
 
-        MortonDistribType weight_prev, weight_next;
+        MortonDistrib_type weight_prev, weight_next;
         std::tie(weight_prev, weight_next) = exchange_data_left_right(conf, minIndex, maxIndex);
 
         if(maxIndex[0] == weight_next[0])
@@ -316,6 +340,7 @@ namespace scalfmm::parallel::utils
             weight[0][1] += weight_prev[1];
         }
 
+        // io::print("rank(" + std::to_string(rank) + ") weight final: ", weight);
         ///
         /// compute the number of particles in the leaves
         int nb_part = 0;
@@ -332,15 +357,20 @@ namespace scalfmm::parallel::utils
         {
             block = number_of_particles - rank * block;
         }
+        std::cout << "rank(" << rank << ") N particles: " << nb_part << " block " << block << std::endl;
 
         std::array<Morton_type, 3> local{weight[0][0], weight[weight.size() - 1][0], nb_part};
         std::vector<std::array<Morton_type, 3>> part_distrib(nb_proc);
 
         part_distrib[0] = local;
 
+        //        io::print("rank(" + std::to_string(rank) + ") 0 Distrib cells Index: ", part_distrib);
+        //  std::cout << "rank(" << rank << ") local: " <<local[0]<<" " <<local[1]<<" " <<local[2] <<std::endl;
+
         /// share the distribution on all processors
         auto nb_elt = sizeof(local);
         conf.comm.allgather(local.data(), nb_elt, MPI_CHAR, part_distrib[0].data(), nb_elt, MPI_CHAR /*, 0*/);
+        // io::print("rank(" + std::to_string(rank) + ") Distrib cells Index: ", part_distrib);
         ///
         /// Try to have the same number of particles on a processor
         ///
@@ -356,7 +386,9 @@ namespace scalfmm::parallel::utils
             numberLeaves[i] = part_distrib[i][1] - part_distrib[i][0] + 1;
             maxLeaves = std::max(numberLeaves[i], maxLeaves);
         }
+        // io::print("rank(" + std::to_string(rank) + ") numberLeaves: ", numberLeaves);
 
+        // std::cout << "rank(" + std::to_string(rank) + ") initial tomove: " << maxLeaves << std::endl;
         /// Prevent to have 0 cell on a processor.
         if(maxLeaves > 1)
         {
@@ -365,6 +397,8 @@ namespace scalfmm::parallel::utils
                 tomove[i] = part_distrib[i][2] - block;
             }
         }
+        // io::print("rank(" + std::to_string(rank) + ") initial tomove: ", tomove);
+        //        if(rank == 0)
 
         for(int i = 0; i < nb_proc - 1; ++i)
         {
@@ -380,9 +414,27 @@ namespace scalfmm::parallel::utils
                 tomove[i] = 0;
                 tomove[i + 1] += tomove[i];
             }
+            //                    print("   end (" + std::to_string(i) + ")
+            //                    tomove: ", tomove); print("   end (" +
+            //                    std::to_string(i) + ") tosendR: ",
+            //                    tosendR); print("   end (" +
+            //                    std::to_string(i) + ") tosendL: ",
+            //                    tosendL);
         }
         tosendR[nb_proc - 1] = 0;
 
+        //                       io::print("rank(" + std::to_string(rank) + ") tomove: ", tomove);
+        //                        io::print("rank(" + std::to_string(rank) + ") tosendR: ", tosendR);
+        //                        io::print("rank(" + std::to_string(rank) + ") tosendRL: ", tosendL);
+        ///
+        //            std::cout << "tosendL(" + std::to_string(rank) + "): " << tosendL[rank] << std::endl;
+        //            std::cout << "tosendR(" + std::to_string(rank) + "): " << tosendR[rank] << std::endl;
+        //            if(rank > 0)
+        //                std::cout << "toReceivL(" + std::to_string(rank) + "): " << tosendR[rank - 1] <<
+        //                std::endl;
+        //            if(rank < nb_proc - 1)
+        //                std::cout << "toReceivR(" + std::to_string(rank) + "): " << tosendL[rank + 1] <<
+        //                std::endl;
         int toReceivL, toReceivR;
 
         toReceivL = tosendR[rank - 1] > 0 ? 1 : 0;
@@ -392,48 +444,76 @@ namespace scalfmm::parallel::utils
         ///
         int nb_leaf_to_left{0}, nb_leaf_to_right{0}, nb_part_to_left{0}, nb_part_to_right{0};
         Morton_type morton_to_left{0}, morton_to_right{0};
-        MortonDistribType MortonPart_to_left{{0, 0}}, MortonPart_to_right{{0, 0}};
+        MortonDistrib_type MortonPart_to_left{{0, 0}}, MortonPart_to_right{{0, 0}};
+        // std::cout << rank << " Morton [ " << MortonPart_to_left << ", " << MortonPart_to_right << "]" << std::endl;
 
         if(tosendL[rank] > 0)
         {
             int leaf_idx = 0;
             nb_part_to_left = weight[leaf_idx][1];
+            //     std::cout << " tosendL  leaf_idx " << leaf_idx << "  " << nb_part_to_left << std::endl;
 
             while(nb_part_to_left <= tosendL[rank])
             {
                 leaf_idx++;
                 nb_part_to_left += weight[leaf_idx][1];
+                // std::cout << "   tosendL  new pos " << leaf_idx << "  " << nb_part_to_left << std::endl;
             }
-            nb_leaf_to_left = leaf_idx + 1;
+
+            // nb_leaf_to_left = leaf_idx + 1;
+            nb_leaf_to_left = leaf_idx;
             morton_to_left = weight[leaf_idx][0];
             MortonPart_to_left = {weight[leaf_idx][0], nb_leaf_to_left};
             // New starting Morton index for the local distribution
-            local[0] = weight[leaf_idx + 1][0];
+            // local[0] = weight[leaf_idx + 1][0];   // Bug here ?
+            local[0] = weight[leaf_idx][0];   // Bug here ?
+
+            // std::cout << rank << " local[0] " << local[0] << std::endl;
+            // std::cout << rank << " send  morton_to_left" << morton_to_left << std::endl;
         }
 
         if(tosendR[rank] > 0)
         {
             int leaf_idx = weight.size() - 1;
             nb_part_to_right = weight[leaf_idx][1];
+            //   std::cout << "tosendR  leaf_idx " << leaf_idx << "  " << nb_part_to_right << std::endl;
 
             while(nb_part_to_right <= tosendL[rank])
             {
                 leaf_idx--;
                 nb_part_to_right += weight[leaf_idx][1];
+                //     std::cout << "   - tosendR  new pos " << leaf_idx << "  " << nb_part_to_right <<
+                //     std::endl;
             }
             nb_leaf_to_right = leaf_idx + 1;
             morton_to_right = weight[leaf_idx][0];
             MortonPart_to_right = {weight[leaf_idx][0], nb_leaf_to_left};
             // New starting Morton index for the local distribution
             local[1] = weight[leaf_idx][0];
+
+            // std::cout << rank << " send  " << nb_leaf_to_right << " leaf to right - nb  part " << nb_part_to_right
+            //           << "  " << MortonPart_to_right[0] << std::endl;
+            // std::cout << rank << "send  morton_to_right " << morton_to_right << std::endl;
         }
         local[3] = 0;
+        // std::cout << rank << " local partition [ " << local[0] << ", " << local[1] << "]" << std::endl;
 
         /// Send the number
         /// send to left and right
+        // int nb_elt_from_left{0}, nb_elt_from_right{0};
+        // Morton_type min_idx{part_distrib[rank][0]}, max_idx{part_distrib[rank][1]};
         Morton_type morton_from_left{local[0]}, morton_from_right{local[1]};
 
         /// receive from left right
+        //                auto exchange_val = [&manager, &rank,
+        //                &nb_proc, &tosendL, &tosendR, &toReceivL,
+        //                                     &toReceivR](const auto&
+        //                                     nb_part_to_left, const
+        //                                     auto& nb_part_to_right,
+        //                                                 auto&
+        //                                                 nb_elt_from_left,
+        //                                                 auto&
+        //                                                 nb_elt_from_right)
         {
             // compute the buffer size
 
@@ -468,38 +548,50 @@ namespace scalfmm::parallel::utils
             {
                 cpp_tools::parallel_manager::mpi::request::waitall(toReceivL + toReceivR, tab_mpi_status);
             }
+
+            // std::cout << rank << " Morton Left: " << morton_from_left << "  Morton right: " << morton_from_right
+            //           << std::endl;
         }
+
+        //    exchange_val(nb_part_to_left, nb_part_to_right,
+        //    nb_elt_from_left, nb_elt_from_right);
+
+        //   std::array<Morton_type, 3> local{weight[0][0], weight[weight.size() - 1][0], nb_part};
+        // io::print("rank(" + std::to_string(rank) + ") 00 Distrib cells Index: ", part_distrib);
+
         std::array<Morton_type, 2> local1 = {std::max(morton_from_left, part_distrib[rank][0]),
                                              std::min(morton_from_right, part_distrib[rank][1])};
 
+        // std::cout << rank << " final local 1 [ " << local1[0] << ", " << local1[1] << "]" << std::endl;
+
         morton_distrib[0] = local1;
 
+        // print("rank(" + std::to_string(rank) + ") Distrib cells Index: ", part_distrib);
+        // std::cout << "rank(" << rank << ") Distrib Leaf Index: " <<
+        // nb_part << std::endl;
+
         /// share the distribution on all processors
         nb_elt = sizeof(local1);
         conf.comm.allgather(local1.data(), nb_elt, MPI_CHAR, morton_distrib[0].data(), nb_elt, MPI_CHAR /*, 0*/);
+        // io::print("rank(" + std::to_string(rank) + ") Morton distrib final: ", morton_distrib);
 
 #endif
+        std::cout << cpp_tools::colors::green << " --> End distrib::balanced_particles  " << cpp_tools::colors::reset
+                  << std::endl;
         return morton_distrib;
     }
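+    // Hedged usage sketch (particles_container and leafMortonIdx are hypothetical local containers):
+    //
+    //   auto morton_distrib = balanced_particles(manager, particles_container, leafMortonIdx,
+    //                                            total_number_of_particles);
+    //   // morton_distrib[p] is the {first, last} Morton index interval assigned to process p.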
 
-    /**
-     * @brief
-     *
-     * @tparam ParticleArrayType
-     * @tparam MortonArrayType
-     * @tparam MortonDistribType
-     * @param my_rank
-     * @param particles
-     * @param morton_array
-     * @param morton_dist
-     * @return auto
-     */
-    template<typename ParticleArrayType, typename MortonArrayType, typename MortonDistribType>
-    auto compute_communications(int my_rank, ParticleArrayType& particles, const MortonArrayType& morton_array,
-                                const MortonDistribType& morton_dist)
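+    /**
+     * @brief Compute the particles to exchange with the other processes according to the Morton distribution.
+     *
+     * @param my_rank the rank of the current process
+     * @param particles the particles owned by the current process
+     * @param morton_array the Morton index of each particle
+     * @param morton_dist the Morton-interval distribution over the processes
+     * @return a tuple (message, details_partL, details_partR) describing the particles to exchange with the
+     *         processes on the left and on the right
+     */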
+    template<typename ParticlesArray_type, typename MortonArray_type, typename MortonDistrib_type>
+    auto compute_communications(int my_rank, ParticlesArray_type& particles, const MortonArray_type& morton_array,
+                                const MortonDistrib_type& morton_dist)
     {
-        using Morton_type = typename MortonArrayType::value_type;
+        // std::cout << cpp_tools::colors::green << " --> Begin distrib::compute_communications  "
+        //           << cpp_tools::colors::reset << std::endl;
 
+        using Morton_type = typename MortonArray_type::value_type;
+        //            auto between = [](const Morton_type& m, const Morton_type& mmin, const Morton_type& mmax) {
+        //                return (mmin <= m) && (m <= mmax);
+        //            };
         std::vector<int> message(morton_dist.size());
         std::vector<std::array<int, 2>> details_partL(morton_dist.size(), {0, 0}),
           details_partR(morton_dist.size(), {0, 0});
@@ -508,7 +600,11 @@ namespace scalfmm::parallel::utils
         /// Compute on the left
         int pos = 0;
         bool new_start = true;
+        // std::string beg("rank(" + std::to_string(my_rank) + ")");
+        //    if(my_rank == 2)
         {
+            // io::print(beg + " morton_dist: ", morton_dist);
+
             for(std::size_t i = 0; i < particles.size(); ++i)
             {
                 if(morton_array[i] >= mortonMin)
@@ -531,14 +627,30 @@ namespace scalfmm::parallel::utils
                     new_start = false;
                 }
                 details_partL[pos][1] += 1;
+                //                    std::cout << beg << i << "    L  m_i " << morton_array[i] << " min " <<
+                //                    mortonMin << "  rank "
+                //                              << morton_dist[pos] << " " << std::boolalpha
+                //                              << between(morton_array[i], morton_dist[pos][0],
+                //                              morton_dist[pos][1]) << "  pos " << pos
+                //                              << std::endl;
             }
         }
         /// Compute on the right
+        //            print("rank(" + std::to_string(my_rank) + ") message: ", message);
+        //            print("rank(" + std::to_string(my_rank) + ") details_part: ", details_partL);
         {
+            //              print(beg + " morton_dist (Right): ", morton_dist);
             pos = morton_dist.size() - 1;
+            // my_rank + 1;
 
             for(std::size_t i = particles.size() - 1; i > 0; --i)
             {
+                //                    std::cout << beg << i << "   R   m_i " << morton_array[i] << " max " <<
+                //                    mortonMax << "  rank "
+                //                              << morton_dist[pos] << " " << std::boolalpha
+                //                              << between(morton_array[i], morton_dist[pos][0],
+                //                              morton_dist[pos][1]) << "  pos " << pos
+                //                              << std::endl;
                 if(morton_array[i] <= mortonMax)
                 {
                     break;
@@ -559,18 +671,23 @@ namespace scalfmm::parallel::utils
                     new_start = false;
                 }
                 details_partR[pos][1] += 1;
+                //                    std::cout << beg << i << "   R   m_i " << morton_array[i] << " max " <<
+                //                    mortonMax << "  rank "
+                //                              << morton_dist[pos] << " " << std::boolalpha
+                //                              << between(morton_array[i], morton_dist[pos][0],
+                //                              morton_dist[pos][1]) << "  pos " << pos
+                //                              << std::endl;
             }
+            //                print("rank(" + std::to_string(my_rank) + ") message: ", message);
+            //                print("rank(" + std::to_string(my_rank) + ") details_part R: ", details_partR);
         }
+        //	std::cout << cpp_tools::colors::green << " --> End distrib::compute_communications  "
+        //		  << cpp_tools::colors::reset << std::endl;
         return std::make_tuple(message, details_partL, details_partR);
     }
-
     /**
      * @brief
      *
-     * @tparam ParticleArrayType
-     * @tparam MortonArrayType
-     * @tparam MortonDistribType
-     * @tparam BoxType
      * @param manager
      * @param particles
      * @param morton_array
@@ -579,21 +696,30 @@ namespace scalfmm::parallel::utils
      * @param leaf_level
      * @param total_num_particles
      */
-    template<typename ParticleArrayType, typename MortonArrayType, typename MortonDistribType, typename BoxType>
-    auto fit_particles_in_distrib(cpp_tools::parallel_manager::parallel_manager& manager, ParticleArrayType& particles,
-                                  const MortonArrayType& morton_array, const MortonDistribType& morton_dist,
-                                  const BoxType& box, const int& leaf_level, const int& total_num_particles) -> void
+    template<typename ParticlesArray_type, typename MortonArray_type, typename MortonDistrib_type, typename Box_type>
+    void fit_particles_in_distrib(cpp_tools::parallel_manager::parallel_manager& manager,
+                                  ParticlesArray_type& particles, const MortonArray_type& morton_array,
+                                  const MortonDistrib_type& morton_dist, const Box_type& box, const int& leaf_level,
+                                  const int& total_num_particles)
     {
+        //     std::cout << cpp_tools::colors::green << " --> Begin distrib::fit_particles_in_distrib  "
+        //               << cpp_tools::colors::reset << std::endl;
         int my_rank = manager.get_process_id();
         int nb_proc = manager.get_num_processes();
+        // std::cout << "  (" << my_rank << ") size " << particles.size() << " "
+        //           << morton_array.size() << std::endl;
 #ifdef SCALFMM_USE_MPI
         auto comm = manager.get_communicator();
+        //            std::cout << "\n------------- fit_particles_in_distrib -------------" << std::endl;
+        // io::print("rank(" + std::to_string(my_rank) + ") morton_array: ", morton_array);
         // get the min and the max morton index of the particles own by the
         // process
         // send the number of communication we will receive
+        // auto mortonMin = morton_dist[my_rank][0];
+        // auto mortonMax = morton_dist[my_rank][1];
 
         auto to_comm = std::move(compute_communications(my_rank, particles, morton_array, morton_dist));
-
+        //  std::cout << "  (" << my_rank << ") " <<  std::get<0>(to_comm) << std::endl;
         // Send these numbers
         auto nb_message = std::get<0>(to_comm);
         auto nb_length_left = std::get<1>(to_comm);
@@ -602,39 +728,63 @@ namespace scalfmm::parallel::utils
 
         comm.allreduce(nb_message.data(), message_to_receiv.data(), nb_proc, MPI_INT, MPI_SUM);
 
+        //   print("rank(" + std::to_string(my_rank) + ") final message: ", message_to_receiv);
+
+        //
+        //            int nb_message_to_receiv =
+        //            message_to_receiv[my_rank];
         int buffer_size_left{0}, buffer_size_right{0};
         int nb_left = my_rank > 0 ? nb_length_left[my_rank - 1][1] : 0;
         int nb_right = my_rank + 1 != nb_proc ? nb_length_right[my_rank + 1][1] : 0;
 
         cpp_tools::parallel_manager::mpi_config conf(comm);
         std::tie(buffer_size_left, buffer_size_right) = exchange_data_left_right(conf, nb_left, nb_right);
-
+        //            std::cout << "rank(" + std::to_string(my_rank) + ")  nb_left: " << nb_left << std::endl;
+        //            std::cout << "rank(" + std::to_string(my_rank) + ")  nb_right: " << nb_right << std::endl;
+        //            std::cout << "rank(" + std::to_string(my_rank) + ")  buffer_size_left: " <<
+        //            buffer_size_left
+        //            << std::endl; std::cout << "rank(" + std::to_string(my_rank) + ")  buffer_size_right: " <<
+        //            buffer_size_right << std::endl;
+        ///
         /// Send the particles
         /// if nb_left >0 we send a communication on the left
         /// if nb_right >0 we send a communication on the right
         /// if buffer_size_left >0 we receive a communication on the left
         /// if buffer_size_right >0 we receive a communication on the right
-        using particle_type = typename ParticleArrayType::value_type;
+        using particle_type = typename ParticlesArray_type::value_type;
         particle_type *buffer_left{nullptr}, *buffer_right{nullptr};
         const int to_right = (my_rank + 1 == nb_proc) ? MPI_PROC_NULL : my_rank + 1;
         const int to_left = (my_rank == 0) ? MPI_PROC_NULL : my_rank - 1;
 
         if(nb_left > 0)
         {
+            //                std::cout << my_rank << " send first part to " << to_left << " nb val= " <<
+            //                nb_left << " first p "
+            //                          << particles[0] << std::endl;
+
             conf.comm.isend(particles.data(), nb_left * sizeof(particle_type), MPI_CHAR, to_left, 100);
         }
         if(nb_right > 0)
         {
             int start = particles.size() - nb_right;
+            //                std::cout << my_rank << " send last part to " << to_right << " nb val= " <<
+            //                nb_right
+            //                << " first p "
+            //                          << particles[start] << std::endl;
             conf.comm.isend(&(particles[start]), nb_right * sizeof(particle_type), MPI_CHAR, to_right, 100);
         }
         ///
         int nb_commL{(buffer_size_left > 0) ? 1 : 0}, nb_commR{(buffer_size_right > 0) ? 1 : 0};
         std::vector<cpp_tools::parallel_manager::mpi::request> tab_mpi_status;
+        //            buffer_right = new particle_type[buffer_size_right];
 
         if(nb_commL > 0)
         {
             buffer_left = new particle_type[buffer_size_left];
+            //                std::cout << my_rank << " post a receiv on left " << to_left << " b " <<
+            //                buffer_left
+            //                << " size "
+            //                          << buffer_size_left << std::endl;
 
             tab_mpi_status.push_back(
               conf.comm.irecv(buffer_left, buffer_size_left * sizeof(particle_type), MPI_CHAR, to_left, 100));
@@ -643,6 +793,10 @@ namespace scalfmm::parallel::utils
         {
             buffer_right = new particle_type[buffer_size_right];
 
+            //                std::cout << my_rank << " post a receiv on right " << to_right << " b " <<
+            //                buffer_right << " size "
+            //                          << buffer_size_right << " " << std::endl;
+
             tab_mpi_status.push_back(
               conf.comm.irecv(buffer_right, buffer_size_right * sizeof(particle_type), MPI_CHAR, to_right, 100));
         }
@@ -650,25 +804,36 @@ namespace scalfmm::parallel::utils
         // Prepare the copy during the communications
         //
         int new_part_size = particles.size() - nb_left - nb_right + buffer_size_left + buffer_size_right;
+        //   std::cout << my_rank << " old size " << particles.size() << " new size " << new_part_size <<
+        //   std::endl;
 
-        ParticleArrayType newArray(new_part_size);
+        ParticlesArray_type newArray(new_part_size);
         /// Here we copy in the right place the particles that do not move
         auto start = particles.begin() + nb_left /*std::advance(particles.begin(), nb_left)*/;
         auto end = particles.end() - nb_right /* std::advance(std::begin(particles), particles.size() - nb_right)*/;
         auto start_out = newArray.begin() + buffer_size_left /*std::advance(std::begin(newArray), buffer_size_left)*/;
         std::copy(start, end, start_out);
 
+        //       conf.comm.barrier();
+        //            std::cout << my_rank << " status size " << tab_mpi_status.size() << std::endl;
         if(tab_mpi_status.size() > 0)
         {
+            //                std::cout << my_rank << " I'm waiting  " << tab_mpi_status.size() << " " <<
+            //                buffer_left << "  "
+            //                          << buffer_right << std::endl;
             for(int j = 0; j < tab_mpi_status.size(); ++j)
             {
                 cpp_tools::parallel_manager::mpi::status status;
                 tab_mpi_status[j].get_status(status);
+                //                    std::cout << my_rank << " request " << j << "  count " <<
+                //                    status.get_count(MPI_CHAR) << " source "
+                //                              << status.source() << " tag " << status.tag() << std::endl;
             }
             cpp_tools::parallel_manager::mpi::request::waitall(tab_mpi_status.size(), tab_mpi_status.data());
         }
         conf.comm.barrier();
 
+        //            std::cout << my_rank << " ---------- End Redistribution ----------" << std::endl;
         if(buffer_left)
         {
             /// Here we copy in the right place the particles that do not move
@@ -703,21 +868,20 @@ namespace scalfmm::parallel::utils
 #else
         new_num_particles = particles.size();
 #endif
-    }
 
-    /**
-     * @brief Build the cell distribution at one level upper
-     *
-     * @tparam VectorMortonIdx
-     * @tparam MortonDistribution
-     * @param[in] para the parallel manager
-     * @param[in] dimension dimension of the problem.
-     * @param[in] level current level to construct the cell distribution
-     * @param[inout] mortonCellIndex the index cell at level+1 (in) and we construct the parent cells (out).
-     * @param ghost_l2l
-     * @param[in] cells_distrib at level + 1
-     * @return the cell distribution at level
-     */
+        // std::cout << cpp_tools::colors::green << " --> End distrib::fit_particles_in_distrib  "
+        //           << cpp_tools::colors::reset << std::endl;
+    }
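The redistribution above posts non-blocking sends and receives towards the two neighbouring ranks and copies the untouched middle range while the messages are in flight. A minimal standalone sketch of that pattern, using raw MPI calls instead of the cpp_tools wrappers used in this file, and assuming the left/right counts have already been exchanged with the neighbours (as exchange_data_left_right does):

#include <algorithm>
#include <mpi.h>
#include <vector>

// nb_left/nb_right: number of local particles to ship to the left/right neighbour.
// recv_left/recv_right: number of particles announced by those neighbours.
std::vector<double> exchange_left_right(MPI_Comm comm, const std::vector<double>& part,
                                        int nb_left, int nb_right, int recv_left, int recv_right)
{
    int rank{}, size{};
    MPI_Comm_rank(comm, &rank);
    MPI_Comm_size(comm, &size);
    const int left = (rank == 0) ? MPI_PROC_NULL : rank - 1;
    const int right = (rank + 1 == size) ? MPI_PROC_NULL : rank + 1;

    std::vector<double> result(part.size() - nb_left - nb_right + recv_left + recv_right);
    std::vector<MPI_Request> reqs;
    MPI_Request r;
    if(nb_left > 0)   { MPI_Isend(part.data(), nb_left, MPI_DOUBLE, left, 100, comm, &r); reqs.push_back(r); }
    if(nb_right > 0)  { MPI_Isend(part.data() + part.size() - nb_right, nb_right, MPI_DOUBLE, right, 100, comm, &r); reqs.push_back(r); }
    if(recv_left > 0) { MPI_Irecv(result.data(), recv_left, MPI_DOUBLE, left, 100, comm, &r); reqs.push_back(r); }
    if(recv_right > 0){ MPI_Irecv(result.data() + result.size() - recv_right, recv_right, MPI_DOUBLE, right, 100, comm, &r); reqs.push_back(r); }

    // Copy the particles that stay on this rank while the messages progress.
    std::copy(part.begin() + nb_left, part.end() - nb_right, result.begin() + recv_left);

    MPI_Waitall(static_cast<int>(reqs.size()), reqs.data(), MPI_STATUSES_IGNORE);
    return result;
}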
+    ///
+    /// \brief Build the cell distribution one level up
+    /// \param[in] para the parallel manager
+    /// \param[in] dimension dimension of the problem
+    /// \param[in] level current level at which the cell distribution is constructed
+    /// \param[inout] mortonCellIndex the cell indexes at level+1 (in); on output the parent cell indexes
+    /// \param[in] cells_distrib the cell distribution at level + 1
+    /// \return the cell distribution at level
+    ///
     template<typename VectorMortonIdx, typename MortonDistribution>
     inline auto build_upper_distribution(cpp_tools::parallel_manager::parallel_manager& para,
                                          const std::size_t dimension, const int& level,
@@ -725,9 +889,14 @@ namespace scalfmm::parallel::utils
                                          const MortonDistribution& cells_distrib) -> MortonDistribution
     {
         using morton_type = typename VectorMortonIdx::value_type;
-
+        // std::cout << cpp_tools::colors::blue << " --> Begin distrib::build_upper_distribution at level " << level
+        //           << cpp_tools::colors::reset << std::endl;
+        // std::cout << std::endl;
         MortonDistribution parent_distrib(cells_distrib);
         auto rank = para.get_process_id();
+        // std::int64_t ghost_parent{-1};
+        //            io::print("rank(" + std::to_string(rank) + ") cells_distrib: ", cells_distrib);
+        //            io::print("rank(" + std::to_string(rank) + ") mortonCellIndex: ", mortonCellIndex);
 
         // get the parent distribution
         for(auto& p: parent_distrib)
@@ -828,6 +997,7 @@ namespace scalfmm::parallel::utils
             mortonCellIndex.erase(last, mortonCellIndex.end());
         }
 
+        // io::print("rank(" + std::to_string(rank) + ") mortonCellIndex1: ", mortonCellIndex);
         parent_distrib[0][0] = parent_distrib[rank][0];
         parent_distrib[0][1] = parent_distrib[rank][1];
         auto mpi_type = cpp_tools::parallel_manager::mpi::get_datatype<morton_type>();
@@ -835,47 +1005,66 @@ namespace scalfmm::parallel::utils
         /// share the distribution on all processors
         para.get_communicator().allgather(parent_distrib.data(), 2, mpi_type);
 
+        // print_distrib("parent_distrib(allgather):", rank, parent_distrib);
+
+        // std::cout << cpp_tools::colors::blue << " --> End distrib::build_upper_distribution at level " << level
+        //           << cpp_tools::colors::reset << std::endl;
         return parent_distrib;
     }
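build_upper_distribution derives the parent distribution from the child Morton indexes. A short sketch of the parent/child relation assumed here for Morton codes in a 2^d tree (d bits per level; the helper name is illustrative, not part of the library):

#include <cstddef>
#include <cstdint>

// Assumption: each tree level contributes 'dimension' bits to the Morton code,
// so the parent index is the child index with its last 'dimension' bits dropped.
constexpr std::int64_t parent_morton_index(std::int64_t child, std::size_t dimension)
{
    return child >> dimension;
}
static_assert(parent_morton_index(0b101'011, 3) == 0b101, "octree: parent of 43 is 5");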
 
-    /**
-     * @brief Merges two sorted vectors.
-     *
-     * Elements appear only once.
-     *
-     * @tparam VectorMortonIdx
-     * @param v1 first vector to merge.
-     * @param v2 vector to merge.
-     * @return the merged vector to the first vector.
-     */
+    ///
+    /// \brief merge two sorted vectors
+    ///
+    /// Elements appear only once
+    ///
+    /// \param[in] v1 first sorted vector to merge
+    /// \param[in] v2 second sorted vector to merge
+    ///
+    /// \return the merged vector, in which each element appears only once
+    ///
     template<typename VectorMortonIdx>
-    inline auto merge_unique(VectorMortonIdx& v1, const VectorMortonIdx& v2) -> VectorMortonIdx
+    inline VectorMortonIdx merge_unique(VectorMortonIdx& v1, const VectorMortonIdx& v2)
     {
+        /*            std::cout << cpp_tools::colors::green << " --> Begin let::merge_unique  " <<
+           cpp_tools::colors::reset
+                              << std::endl*/
         VectorMortonIdx dst;
         std::merge(v1.begin(), v1.end(), v2.begin(), v2.end(), std::back_inserter(dst));
         auto last = std::unique(dst.begin(), dst.end());
+        //	std::cout << "  last "  << *last <<std::endl;
         dst.erase(last, dst.end());
-
+        //            std::cout << cpp_tools::colors::green << " --> End let::merge_unique  " <<
+        //            cpp_tools::colors::reset
+        //                      << std::endl;
+        //	io::print(" merge uniq dst", dst);
         return dst;
     }
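A quick illustration of the contract documented above (a sketch, assuming VectorMortonIdx is an ordinary std::vector of integer Morton codes; the include path is an assumption):

#include <cstdint>
#include <vector>
// #include "scalfmm/parallel/utils.hpp"   // header path is an assumption

int main()
{
    // Both inputs are sorted; the result is sorted and duplicate-free.
    std::vector<std::int64_t> v1{1, 3, 5, 7};
    std::vector<std::int64_t> v2{3, 4, 7, 9};
    auto merged = scalfmm::parallel::utils::merge_unique(v1, v2);
    // merged == {1, 3, 4, 5, 7, 9}
}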
 
-    /**
-     * @brief
-     *
-     * @tparam VectorMortonIdx
-     * @tparam Iterator
-     * @param a_beg
-     * @param a_end
-     * @param b_beg
-     * @param b_end
-     * @return VectorMortonIdx
-     */
     template<typename VectorMortonIdx, typename Iterator>
-    auto merge_unique_fast(const Iterator a_beg, const Iterator a_end, const Iterator b_beg,
-                           const Iterator b_end) -> VectorMortonIdx
+    VectorMortonIdx merge_unique_fast(const Iterator a_beg, const Iterator a_end, const Iterator b_beg,
+                                      const Iterator b_end)
     {
-        int j{0};
+        // io::print(std::cout, " merge uniq v1", a_beg, a_end);
+        // std::cout << std::endl;
+        // io::print(std::cout, " merge uniq v2", b_beg, b_end);
+        // std::cout << std::endl;
         int n = std::distance(b_beg, b_end);
+        int m = std::distance(a_beg, a_end);
+
+        if(m == 0)
+        {
+            VectorMortonIdx merged(n);
+            std::copy(b_beg, b_end, merged.begin());
+            return merged;
+        }
+        if(n == 0)
+        {
+            VectorMortonIdx merged(m);
+            std::copy(a_beg, a_end, merged.begin());
+            return merged;
+        }
+        int j{0};
         std::vector<int> add(n);
         int nb_elt_to_add{0};
         Iterator it_a{a_beg}, it_b{b_beg};
@@ -900,9 +1089,9 @@ namespace scalfmm::parallel::utils
                 ++it_a;
             }
         }
-        n = std::distance(a_beg, a_end);
+        // n = std::distance(a_beg, a_end);
 
-        VectorMortonIdx merged(n + nb_elt_to_add, -1);
+        VectorMortonIdx merged(m + nb_elt_to_add, -1);
 
         it_a = a_beg;
         it_b = b_beg;
@@ -931,22 +1120,19 @@ namespace scalfmm::parallel::utils
             }
         }
         std::copy(it_a, a_end, it);
-        //	io::print("merged ", merged);
+        // io::print("merged ", merged);
         return merged;
     }
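Continuing the v1/v2 example above: the iterator-based variant cannot deduce the result type, so it must be given explicitly; under the same assumptions the expected outcome is identical.

auto merged2 = scalfmm::parallel::utils::merge_unique_fast<std::vector<std::int64_t>>(
    v1.cbegin(), v1.cend(), v2.cbegin(), v2.cend());
// merged2 == {1, 3, 4, 5, 7, 9}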
 
-    /**
-     * @brief find if the index exists owning the index
-     *
-     * @tparam MortonIdx
-     * @tparam VectorMortonIdx
-     * @param index
-     * @param my_index
-     * @param[in] start [optional] position to start in the distribution vector
-     * @return the process number (if -1 index not in my distribution)
-     */
+    ///
+    /// \brief Find the process owning the given index
+    /// \param[in] index the index to look for
+    /// \param[in] my_index the index distribution
+    /// \param[in] start [optional] position to start the search in the distribution vector
+    /// \return the process number (-1 if the index is not in the distribution)
+    ///
     template<typename MortonIdx, typename VectorMortonIdx>
-    inline auto find_index(const MortonIdx& index, const VectorMortonIdx& my_index, std::size_t& start) -> std::int64_t
+    inline std::int64_t find_index(const MortonIdx& index, const VectorMortonIdx& my_index, std::size_t& start)
     {
         for(std::size_t i = start; i < my_index.size(); ++i)
         {
@@ -968,19 +1154,8 @@ namespace scalfmm::parallel::utils
         }
         return -1;
     }
-
-    /**
-     * @brief
-     *
-     * @tparam MortonIdx
-     * @tparam LeafInfo
-     * @param index
-     * @param my_leaves
-     * @param start
-     * @return std::int64_t
-     */
     template<typename MortonIdx, typename LeafInfo>
-    inline auto find_index2(const MortonIdx& index, const LeafInfo& my_leaves, std::size_t& start) -> std::int64_t
+    inline std::int64_t find_index2(const MortonIdx& index, const LeafInfo& my_leaves, std::size_t& start)
     {
         for(std::size_t i = start; i < my_leaves.size(); ++i)
         {
@@ -1002,22 +1177,18 @@ namespace scalfmm::parallel::utils
         }
         return -1;
     }
-
-    /**
-     * @brief check if the morton index used in the vector of indexes exist
-     *
-     *  This step needs communication
-     *
-     * @tparam VectorMortonIdx
-     * @tparam MortonDistribution
-     * @param para the parallel manager
-     * @param needed_idx the index to check if they exits in the other processors
-     * @param distrib the index distribution on all processors
-     * @param local_morton_idx My local morton index
-     */
+    ///
+    /// \brief check that the Morton indexes in the vector exist on some process
+    ///
+    ///  This step needs communication
+    /// \param para the parallel manager
+    /// \param needed_idx the indexes whose existence on the other processes is checked
+    /// \param distrib the index distribution on all processes
+    /// \param local_morton_idx my local Morton indexes
+    ///
     template<typename VectorMortonIdx, typename MortonDistribution>
-    auto check_if_morton_index_exist(cpp_tools::parallel_manager::parallel_manager& para, VectorMortonIdx& needed_idx,
-                                     const MortonDistribution& distrib, const VectorMortonIdx& local_morton_idx) -> void
+    void check_if_morton_index_exist(cpp_tools::parallel_manager::parallel_manager& para, VectorMortonIdx& needed_idx,
+                                     const MortonDistribution& distrib, const VectorMortonIdx& local_morton_idx)
     {
         auto rank = para.get_process_id();
         auto nb_proc = para.get_num_processes();
@@ -1184,30 +1355,23 @@ namespace scalfmm::parallel::utils
         // We remove the bad_index in order to have only the existing components (leaf/cell)
         std::sort(needed_idx.begin(), needed_idx.end());
         auto last = std::unique(needed_idx.begin(), needed_idx.end());
-
+        //   io::print("rank(" + std::to_string(rank) + ") uniq needed_idx : ", needed_idx);
         if(*(last - 1) == bad_index)
         {
             last = last - 1;
         }
-
         needed_idx.erase(last, needed_idx.end());
+        // // io::print("rank(" + std::to_string(rank) + ") needed_idx : ", needed_idx);
+        // std::cout << cpp_tools::colors::green << " (" << rank << ") --> End distrib::check_if_morton_index_exist
+        // "
+        //           << cpp_tools::colors::reset << std::endl
+        //           << std::flush;
     }
 
-    /**
-     * @brief
-     *
-     * @tparam VectorMortonIdx
-     * @tparam leafInfo
-     * @tparam MortonDistribution
-     * @param para
-     * @param needed_idx
-     * @param distrib
-     * @param leaf_info
-     */
     template<typename VectorMortonIdx, typename leafInfo, typename MortonDistribution>
-    auto check_if_leaf_morton_index_exist(cpp_tools::parallel_manager::parallel_manager& para,
+    void check_if_leaf_morton_index_exist(cpp_tools::parallel_manager::parallel_manager& para,
                                           VectorMortonIdx& needed_idx, const MortonDistribution& distrib,
-                                          const leafInfo& leaf_info) -> void
+                                          const leafInfo& leaf_info)
     {
         auto rank = para.get_process_id();
         auto nb_proc = para.get_num_processes();
@@ -1381,21 +1545,24 @@ namespace scalfmm::parallel::utils
         }
 
 #endif
+
+        // io::print("rank(" + std::to_string(rank) + ") needed_idx : ", needed_idx);
+        // std::cout << cpp_tools::colors::green << " (" << rank
+        //           << ") --> End distrib::check_if_leaf_morton_index_exist " << cpp_tools::colors::reset <<
+        //           std::endl
+        //           << std::flush;
     }
 
-    /**
-     * @brief find the group owning the index
-     *
-     * @tparam MortonIdx
-     * @tparam GroupIteratorType
-     * @param[in] begin iterator to start search
-     * @param[in] end iterator to complete the search vector
-     * @param[in] index the index
-     * @return GroupIteratorType
-     */
-    template<typename MortonIdx, typename GroupIteratorType>
-    inline auto find_group_for_index(GroupIteratorType begin, GroupIteratorType end,
-                                     const MortonIdx& index) -> GroupIteratorType
+    ///
+    /// \brief find the group owning the index
+    ///
+    /// \param[in] begin iterator on the first group to search
+    /// \param[in] end iterator past the last group to search
+    /// \param[in] index the index
+    /// \return an iterator on the group owning the index (end if not found)
+    ///
+    template<typename MortonIdx, typename Group_iterator_t>
+    inline Group_iterator_t find_group_for_index(Group_iterator_t begin, Group_iterator_t end, const MortonIdx& index)
     {
         for(auto grp_ptr = begin; grp_ptr != end; ++grp_ptr)
         {
@@ -1408,22 +1575,20 @@ namespace scalfmm::parallel::utils
         }
         return end;
     }
-
-    /**
-     * @brief
-     *
-     * @tparam GroupIteratorType
-     * @tparam MortonIdxVectorType
-     * @tparam Dependencies_t
-     * @param begin
-     * @param end
-     * @param morton_to_send
-     * @param deps
-     */
-    template<typename GroupIteratorType, typename MortonIdxVectorType, typename Dependencies_t>
-    inline auto build_dependencies_from_morton_vector(GroupIteratorType begin, GroupIteratorType end,
-                                                      const MortonIdxVectorType& morton_to_send,
-                                                      Dependencies_t& deps) -> void
+    /// @brief  Build the vector of dependencies
+    ///
+    ///       Build the vector of dependencies by adding the first address of the multipoles inside all groups
+    ///        between begin and end
+    /// @tparam Group_iterator_t
+    /// @tparam MortonIdxVector_t
+    /// @tparam Dependencies_t
+    /// @param begin begin iterator on group
+    /// @param end   end iterator on group
+    /// @param morton_to_send the Morton indexes of the cells to send
+    /// @param deps the vector of dependencies
+    template<typename Group_iterator_t, typename MortonIdxVector_t, typename Dependencies_t>
+    void build_multipoles_dependencies_from_morton_vector(Group_iterator_t begin, Group_iterator_t end,
+                                                          const MortonIdxVector_t& morton_to_send, Dependencies_t& deps)
     {
         const int max_idx = morton_to_send.size();   // loop on the groups
         // Find the group containing the first index
@@ -1448,20 +1613,9 @@ namespace scalfmm::parallel::utils
             }
         }
     }
-
-    /**
-     * @brief
-     *
-     * @tparam GroupIteratorType
-     * @tparam MortonIdxVectorType
-     * @param first
-     * @param second
-     * @param mortons
-     * @return auto
-     */
-    template<typename GroupIteratorType, typename MortonIdxVectorType>
-    auto serialise(std::pair<GroupIteratorType, GroupIteratorType> first,
-                   std::pair<GroupIteratorType, GroupIteratorType> second, const MortonIdxVectorType& mortons)
+    template<typename Group_iterator_t, typename MortonIdxVector_t>
+    auto serialise(std::pair<Group_iterator_t, Group_iterator_t> first,
+                   std::pair<Group_iterator_t, Group_iterator_t> second, const MortonIdxVector_t& mortons)
     {
     }
 
diff --git a/include/scalfmm/tools/fma_dist_loader.hpp b/include/scalfmm/tools/fma_dist_loader.hpp
index 7fadf8c4a6679f259b654ae49720e16938005932..28be4b05dd7f37dbdbd973116e6092044c411f9c 100644
--- a/include/scalfmm/tools/fma_dist_loader.hpp
+++ b/include/scalfmm/tools/fma_dist_loader.hpp
@@ -34,35 +34,14 @@ namespace scalfmm::io
         using FFmaGenericLoader<FReal, Dimension>::m_verbose;
         using MPI_Offset = std::size_t;
 
-        /**
-         * @brief Number of particles that the calling process will manage.
-         *
-         */
-        std::size_t m_local_number_of_particles;
+        std::size_t m_local_number_of_particles;   ///<  Number of particles that the calling process will manage.
 
-        /**
-         * @brief
-         *
-         */
-        MPI_Offset m_idxParticles;
+        std::size_t m_start;   ///< index of my first particle in the file
 
-        /**
-         * @brief number of my first parts in file.
-         *
-         */
-        std::size_t m_start;
+        size_t m_headerSize;   ///< size of the header in bytes
 
-        /**
-         * @brief
-         *
-         */
-        size_t m_headerSize;
-
-        /**
-         * @brief
-         *
-         */
-        const cpp_tools::parallel_manager::parallel_manager* m_parallelManager;
+        const cpp_tools::parallel_manager::parallel_manager* m_parallelManager;   ///< a pointer to the parallel manager
+        // MPI_Offset m_idxParticles;                 ///<
 
       public:
         /**
@@ -76,7 +55,7 @@ namespace scalfmm::io
                              const bool verbose = false)
           : FFmaGenericLoader<FReal, Dimension>(inFilename, verbose)
           , m_local_number_of_particles(0)
-          , m_idxParticles(0)
+          //   , m_idxParticles(0)
           , m_headerSize(0)
           , m_parallelManager(&para)
         {
@@ -176,35 +155,15 @@ namespace scalfmm::io
     class DistFmaGenericWriter : public FFmaGenericWriter<FReal>
     {
       protected:
-        /**
-         * @brief
-         *
-         */
-        const cpp_tools::parallel_manager::parallel_manager* m_parallelManager;
+        const cpp_tools::parallel_manager::parallel_manager* m_parallelManager;   ///< a pointer to the parallel manager
 
-        /**
-         * @brief
-         *
-         */
-        bool _writeDone;
+        // bool _writeDone; ///<
 
-        /**
-         * @brief
-         *
-         */
-        int m_headerSize;
+        int m_headerSize;   ///< size of the header in bytes
 
-        /**
-         * @brief number of data to write for one particle.
-         *
-         */
-        int _nbDataTowritePerRecord;
+        int m_nbDataTowritePerRecord;   ///< number of values to write per particle.
 
-        /**
-         * @brief number of particle (global) to write in the file.
-         *
-         */
-        std::size_t _numberOfParticles;
+        std::size_t m_numberOfParticles;   ///< number of particles (global) to write in the file.
 
         using FFmaGenericWriter<FReal>::m_file;
 #ifdef SCALFMM_USE_MPI
@@ -215,6 +174,8 @@ namespace scalfmm::io
         MPI_File _mpiFile;
 #endif
       public:
+        using base_type = FFmaGenericWriter<FReal>;
+        using value_type = FReal;
         /**
          * @brief Construct a new Dist Fma Generic Writer object
          *
@@ -226,18 +187,22 @@ namespace scalfmm::io
          * @param inFilename the name of the file to open.
          * @param para
          */
-        DistFmaGenericWriter(const std::string inFilename, const cpp_tools::parallel_manager::parallel_manager& para)
-          : FFmaGenericWriter<FReal>(inFilename)
+        DistFmaGenericWriter(const std::string inFilename, cpp_tools::parallel_manager::parallel_manager const& para,
+                             const bool verbose = false)
+          : FFmaGenericWriter<FReal>(inFilename, verbose)
           , m_parallelManager(&para)
-          , _writeDone(false)
+          //   , _writeDone(false)
           , m_headerSize(0)
-          , _nbDataTowritePerRecord(8)
-          , _numberOfParticles(0)
+          , m_nbDataTowritePerRecord(8)
+          , m_numberOfParticles(0)
         {
 #ifdef SCALFMM_USE_MPI
             if(!this->isBinary())
             {
-                std::cout << "DistFmaGenericWriter only works with binary file (.bfma)." << std::endl;
+                if(para.io_master())
+                {
+                    std::cout << "DistFmaGenericWriter only works with binary file (.bfma)." << std::endl;
+                }
                 std::exit(EXIT_FAILURE);
             }
             auto comm = m_parallelManager->get_communicator();
@@ -247,7 +212,7 @@ namespace scalfmm::io
             // Is it open?
             if(fileIsOpen != MPI_SUCCESS)
             {
-                std::cerr << "Cannot create parallel file, DistFmaGenericWriter constructeur abort." << std::endl;
+                std::cerr << "Cannot create parallel file, DistFmaGenericWriter constructor abort." << std::endl;
                 std::exit(EXIT_FAILURE);
                 return;
             }
@@ -281,19 +246,20 @@ namespace scalfmm::io
             std::array<unsigned int, 4> typeFReal = {dataType, nbDataPerRecord, dimension, nb_input_values};
             FReal x = boxWidth * FReal(0.5);
             m_headerSize = 0;
-            _nbDataTowritePerRecord = nbDataPerRecord;
-            _numberOfParticles = nbParticles;
+            m_nbDataTowritePerRecord = nbDataPerRecord;
+            m_numberOfParticles = nbParticles;
             if(m_parallelManager->master())
             {
-                FFmaGenericWriter<FReal>::writerBinaryHeader(centerOfBox, boxWidth, nbParticles, typeFReal.data(), 4);
-                std::cout << "centerOfBox " << centerOfBox << " boxWidth " << boxWidth << " nbParticles " << nbParticles
-                          << " dataType " << dataType << " nbDataPerRecord " << nbDataPerRecord << " dimension "
-                          << dimension << " nb_input_values " << nb_input_values << std::endl;
+                auto half_Box_width = boxWidth * value_type(0.5);
+                base_type::writerBinaryHeader(centerOfBox, half_Box_width, nbParticles, typeFReal.data(), 4);
+                // std::cout << "centerOfBox " << centerOfBox << " half_Box_width " << half_Box_width << " nbParticles "
+                //           << nbParticles << " dataType " << dataType << " nbDataPerRecord " << nbDataPerRecord
+                //           << " dimension " << dimension << " nb_input_values " << nb_input_values << std::endl;
 #ifdef SCALFMM_USE_MPI
-                for(auto a: typeFReal)
-                {
-                    std::cout << "typeFReal " << a << std::endl;
-                }
+                // for(auto a: typeFReal)
+                // {
+                //     std::cout << "typeFReal " << a << std::endl;
+                // }
                 int sizeType = 0;
                 int ierr = 0;
                 auto mpiInt64 = cpp_tools::parallel_manager::mpi::get_datatype<std::size_t>();
@@ -324,8 +290,8 @@ namespace scalfmm::io
                 MPI_Type_size(mpiReal, &sizeType);
                 m_headerSize += sizeType * (1 + PointType::dimension);
                 // Build the header offset
-                std::cout << " headerSize " << m_headerSize << std::endl;
-                FFmaGenericWriter<FReal>::close();
+                // std::cout << " headerSize " << m_headerSize << std::endl;
+                FFmaGenericWriter<FReal>::close();   // TODO: clarify why the writer must be closed here
 #endif
             }
 #ifdef SCALFMM_USE_MPI
@@ -333,7 +299,7 @@ namespace scalfmm::io
 
             comm.bcast(&m_headerSize, 1, MPI_INT, 0);
             //  MPI_Bcast(&_headerSize, 1, MPI_INT, 0, m_parallelManager->global().getComm());
-            std::cout << "  _headerSize  " << m_headerSize << std::endl;
+            // std::cout << "  _headerSize  " << m_headerSize << std::endl;
 #endif
 
             //  MPI_File_close(&_mpiFile);
@@ -349,111 +315,94 @@ namespace scalfmm::io
          * @brief Write all for all particles the position, physical values, potential and forces.
          *
          * @tparam TreeType
-         * @param myOctree the octree
+         * @param tree the octree
          * @param nbParticles number of particles.
          */
         template<typename TreeType>
-        auto writeFromTree(const TreeType& myOctree, const std::size_t& nbParticles) -> void
+        auto writeFromTree(const TreeType& tree, std::size_t const& nbParticles) -> void
         {
-            //            //
-            //            // Write the header
-            //            int sizeType = 0, ierr = 0;
-            //            FReal tt = 0.0;
-            //            MPI_Datatype mpistd::size_t_t = m_parallelManager->GetType(nbParticles);
-            //            MPI_Datatype mpiFReal_t = m_parallelManager->GetType(tt);
-            //            MPI_Type_size(mpiFReal_t, &sizeType);
-            //            int myRank = m_parallelManager->global().processId();
-            //            _headerSize = 0;
-            //            //
-            //            unsigned int typeFReal[2] = {sizeof(FReal), static_cast<unsigned
-            //            int>(_nbDataTowritePerRecord)}; if(myRank == 0)
-            //            {
-            //                ierr = MPI_File_write_at(_mpiFile, 0, &typeFReal, 2, MPI_INT, MPI_STATUS_IGNORE);
-            //            }
-            //            MPI_Type_size(MPI_INT, &sizeType);
-            //            _headerSize += sizeType * 2;
-            //            if(myRank == 0)
-            //            {
-            //                ierr = MPI_File_write_at(_mpiFile, _headerSize, &nbParticles, 1, mpistd::size_t_t,
-            //                MPI_STATUS_IGNORE);
-            //            }
-            //            MPI_Type_size(mpistd::size_t_t, &sizeType);
-            //            _headerSize += sizeType * 1;
-            //            auto centerOfBox = myOctree.getBoxCenter();
-            //            FReal boxSim[4] = {myOctree.getBoxWidth() * 0.5, centerOfBox.getX(), centerOfBox.getX(),
-            //                               centerOfBox.getX()};
-
-            //            if(myRank == 0)
-            //            {
-            //                ierr = MPI_File_write_at(_mpiFile, _headerSize, &boxSim[0], 4, mpiFReal_t,
-            //                MPI_STATUS_IGNORE);
-            //            }
-            //            if(ierr > 0)
-            //            {
-            //                std::cerr << "Error during the construction of the header in "
-            //                             "FMpiFmaGenericWriter::writeDistributionOfParticlesFromOctree"
-            //                          << std::endl;
-            //            }
-            //            MPI_Type_size(mpiFReal_t, &sizeType);
-            //            _headerSize += sizeType * 4;
-            //            //
-            //            // Construct the local number of particles on my process
-            //            std::size_t nbLocalParticles = 0, maxPartLeaf = 0;
-            //            MortonIndex starIndex = mortonLeafDistribution[2 * myRank],
-            //                        endIndex = mortonLeafDistribution[2 * myRank + 1];
-            //            myOctree.template forEachCellLeaf<typename TreeType::LeafClass_T>(
-            //              [&](typename TreeType::GroupSymbolCellClass_T* gsymb,
-            //                  typename TreeType::GroupCellUpClass_T* /* gmul */,
-            //                  typename TreeType::GroupCellDownClass_T* /* gloc */,
-            //                  typename TreeType::LeafClass_T* leafTarget) {
-            //                  if(!(gsymb->getMortonIndex() < starIndex || gsymb->getMortonIndex() > endIndex))
-            //                  {
-            //                      auto n = leafTarget->getNbParticles();
-            //                      nbLocalParticles += n;
-            //                      maxPartLeaf = std::max(maxPartLeaf, n);
-            //                  }
-            //              });
-            //            std::vector<FReal> particles(maxPartLeaf * _nbDataTowritePerRecord);
-            //            // Build the offset for eaxh processes
-            //            std::size_t before = 0;   // Number of particles before me (rank < myrank)
-            //            MPI_Scan(&nbLocalParticles, &before, 1, mpistd::size_t_t, MPI_SUM,
-            //            m_parallelManager->global().getComm()); before -= nbLocalParticles; MPI_Offset offset =
-            //            _headerSize + sizeType * _nbDataTowritePerRecord * before;
-            //            //
-            //            // Write particles in file
-            //            myOctree.template forEachCellLeaf<typename TreeType::LeafClass_T>(
-            //              [&](typename TreeType::GroupSymbolCellClass_T* gsymb,
-            //                  typename TreeType::GroupCellUpClass_T* /* gmul */,
-            //                  typename TreeType::GroupCellDownClass_T* /* gloc */,
-            //                  typename TreeType::LeafClass_T* leafTarget) {
-            //                  if(!(gsymb->getMortonIndex() < starIndex || gsymb->getMortonIndex() > endIndex))
-            //                  {
-            //                      const std::size_t nbPartsInLeaf = leafTarget->getNbParticles();
-            //                      const FReal* const posX = leafTarget->getPositions()[0];
-            //                      const FReal* const posY = leafTarget->getPositions()[1];
-            //                      const FReal* const posZ = leafTarget->getPositions()[2];
-            //                      const FReal* const physicalValues = leafTarget->getPhysicalValues();
-            //                      const FReal* const forceX = leafTarget->getForcesX();
-            //                      const FReal* const forceY = leafTarget->getForcesY();
-            //                      const FReal* const forceZ = leafTarget->getForcesZ();
-            //                      const FReal* const potential = leafTarget->getPotentials();
-            //                      for(int i = 0, k = 0; i < nbPartsInLeaf; ++i, k += _nbDataTowritePerRecord)
-            //                      {
-            //                          particles[k] = posX[i];
-            //                          particles[k + 1] = posY[i];
-            //                          particles[k + 2] = posZ[i];
-            //                          particles[k + 3] = physicalValues[i];
-            //                          particles[k + 4] = potential[i];
-            //                          particles[k + 5] = forceX[i];
-            //                          particles[k + 6] = forceY[i];
-            //                          particles[k + 7] = forceZ[i];
-            //                      }
-            //                      MPI_File_write_at(_mpiFile, offset, particles.data(),
-            //                                        static_cast<int>(_nbDataTowritePerRecord * nbPartsInLeaf),
-            //                                        mpiFReal_t, MPI_STATUS_IGNORE);
-            //                      offset += sizeType * _nbDataTowritePerRecord * nbPartsInLeaf;
-            //                  }
-            //              });
+            // The header is already written
+            static constexpr int dimension = TreeType::dimension;
+            using int64 = int;
+
+            int maxPartLeaf{0}, nbLocalParticles{0};
+            scalfmm::component::for_each_mine_leaf(tree.begin_mine_leaves(), tree.end_mine_leaves(),
+                                                   [&nbLocalParticles, &maxPartLeaf](auto& leaf)
+                                                   {
+                                                       auto n = static_cast<int>(leaf.size());
+                                                       nbLocalParticles += n;
+                                                       maxPartLeaf = std::max(maxPartLeaf, n);
+                                                   });
+            // Build the offset for each process
+            int64 before{0};   // Number of particles before me (rank < myrank)
+            int sizeRealType{0}, ierr{0};
+
+            auto mpiReal = cpp_tools::parallel_manager::mpi::get_datatype<FReal>();
+            MPI_Type_size(mpiReal, &sizeRealType);
+            MPI_Datatype mpiInt64 = cpp_tools::parallel_manager::mpi::get_datatype<int64>();
+
+            // #ifdef SCALFMM_USE_MPI
+            auto comm = m_parallelManager->get_communicator();
+            MPI_Scan(&nbLocalParticles, &before, 1, mpiInt64, MPI_SUM, comm);
+            before -= nbLocalParticles;
+            // std::cout << " nbLocalParticles  " << nbLocalParticles << " maxPartLeaf " << maxPartLeaf << " before "
+            //           << before << std::endl
+            //           << std::flush;
+            //
+            MPI_Offset offset = m_headerSize + sizeRealType * m_nbDataTowritePerRecord * before;
+            // std::cout << " offset to write part  " << offset << std::endl;
+            //
+            // Write particles in file
+
+            using value_type = typename TreeType::leaf_type::value_type;
+            static constexpr int nb_elt_per_par =
+              dimension + TreeType::particle_type::inputs_size + TreeType::particle_type::outputs_size;
+            // std::cout << "nb_elt_per_par " << nb_elt_per_par << std::endl;
+            using particles_t = std::array<value_type, nb_elt_per_par>;
+            std::vector<particles_t> particles(nbLocalParticles);
+            // std::vector<FReal> particles(maxPartLeaf * m_nbDataTowritePerRecord);
+
+            //
+            scalfmm::component::for_each_mine_leaf(
+              tree.begin_mine_leaves(), tree.end_mine_leaves(),
+              [this, &offset, &particles, &sizeRealType, &mpiReal](auto& leaf)
+              {
+                  int pos = 0;
+                  auto nbPartsInLeaf = leaf.size();
+                  //   std::cout << " leaf index " << leaf.index() << " nbpart " << leaf.size() << std::endl;
+                  for(auto const& it_p: leaf)
+                  {
+                      auto& particles_elem = particles[pos++];
+                      const auto& p = typename TreeType::leaf_type::particle_type(it_p);
+                      //
+                      int i = 0;
+                      const auto points = p.position();
+                      for(int k = 0; k < dimension; ++k, ++i)
+                      {
+                          particles_elem[i] = points[k];
+                      }
+                      // get inputs
+                      for(int k = 0; k < TreeType::particle_type::inputs_size; ++k, ++i)
+                      {
+                          particles_elem[i] = p.inputs(k);
+                      }
+                      // get outputs
+                      for(int k = 0; k < TreeType::particle_type::outputs_size; ++k, ++i)
+                      {
+                          particles_elem[i] = p.outputs(k);
+                      }
+                      //   std::cout << "      " << pos << " part  " << particles_elem << std::endl;
+                  }
+                  //   std::cout << " write to  ptr_data " << particles.data() << " size "
+                  //             << static_cast<int>(m_nbDataTowritePerRecord * nbPartsInLeaf) << std::endl;
+                  MPI_File_write_at(_mpiFile, offset, particles.data(),
+                                    static_cast<int>(m_nbDataTowritePerRecord * nbPartsInLeaf), mpiReal,
+                                    MPI_STATUS_IGNORE);
+                  offset += sizeRealType * m_nbDataTowritePerRecord * nbPartsInLeaf;
+                  //   std::cout << " next offset to write part  " << offset << std::endl;
+                  //   std::cout << std::endl << std::flush;
+              });
 
 #ifdef SCALFMM_USE_MPI
             MPI_File_close(&_mpiFile);
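The writer above derives each rank's file offset from an inclusive MPI_Scan of the local particle counts, turned into an exclusive prefix sum. A minimal standalone sketch of that offset computation (raw MPI; the function name and the long long counter are illustrative choices):

#include <mpi.h>

// Returns the byte offset at which this rank writes its particles:
// header + (number of particles owned by lower ranks) * record size.
MPI_Offset my_write_offset(MPI_Comm comm, long long n_local,
                           MPI_Offset header_size, int bytes_per_record)
{
    long long before{0};
    MPI_Scan(&n_local, &before, 1, MPI_LONG_LONG, MPI_SUM, comm);   // inclusive prefix sum
    before -= n_local;                                              // make it exclusive
    return header_size + static_cast<MPI_Offset>(before) * bytes_per_record;
}
// Each rank can then write with MPI_File_write_at(file, offset, buffer, count, datatype, MPI_STATUS_IGNORE).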
diff --git a/include/scalfmm/tools/fma_loader.hpp b/include/scalfmm/tools/fma_loader.hpp
index 5c99f314c83c07fb10baf47599990cc6d77fe212..0cd0ba123b0945836ced7868f889b00ec024266b 100644
--- a/include/scalfmm/tools/fma_loader.hpp
+++ b/include/scalfmm/tools/fma_loader.hpp
@@ -76,60 +76,16 @@ namespace scalfmm::io
     class FFmaGenericLoader
     {
       protected:
-        /**
-         * @brief the stream used to read the file.
-         *
-         */
-        std::fstream* m_file;
-
-        /**
-         * @brief if true the file to read is in binary mode.
-         *
-         */
-        bool m_binaryFile;
-
-        /**
-         * @brief the center of box (read from file).
-         *
-         */
-        container::point<FReal, Dimension> m_centerOfBox;
-
-        /**
-         * @brief the center of box (read from file).
-         *
-         */
-        std::vector<FReal> m_center{};
-
-        /**
-         * @brief the box width (read from file).
-         *
-         */
-        FReal m_boxWidth;
-
-        /**
-         * @brief the number of particles (read from file)
-         *
-         */
-        std::size_t m_nbParticles;
-
-        /**
-         * @brief Size of the data to read, number of data on 1 line,
-         * dimension of space and number of input values
-         *
-         */
-        std::array<unsigned int, 4> m_typeData;
-
-        /**
-         * @brief file name containung the data.
-         *
-         */
-        std::string m_filename;
-
-        /**
-         * @brief Verbose mode.
-         *
-         */
-        bool m_verbose;
+        std::fstream* m_file;                               ///< the stream used to read the file
+        bool m_binaryFile;                                  ///< if true the file to read is in binary mode
+        container::point<FReal, Dimension> m_centerOfBox;   ///< The center of box (read from file)
+        std::vector<FReal> m_center{};                      ///< The center of box (read from file)
+        FReal m_boxWidth;                                   ///< the box width (read from file)
+        std::size_t m_nbParticles;                          ///< the number of particles (read from file)
+        std::array<unsigned int, 4> m_typeData;   ///< size of the data type, number of values per record,
+                                                  ///< space dimension and number of input values
+        std::string m_filename;                   ///< file name containing the data
+        bool m_verbose;                           ///<  Verbose mode
 
       private:
         /**
@@ -250,25 +206,31 @@ namespace scalfmm::io
 
         /**
          * @brief To know if the file is open and ready to read.
-         *
-         * @return true
-         * @return false
+         * @return true if the file is open and ready to read
          */
-        inline auto isOpen() const -> bool { return this->m_file->is_open() && !this->m_file->eof(); }
+        bool isOpen() const { return this->m_file->is_open() && !this->m_file->eof(); }
 
         /**
          * @brief To get the number of particles from this loader
-         *
-         * @return std::size_t
          */
-        inline auto getNumberOfParticles() const -> std::size_t { return this->getParticleCount(); }
+        std::size_t getNumberOfParticles() const { return this->getParticleCount(); }
+        /**
+         *  @brief To get the number of particles handled by this loader (same as getNumberOfParticles() here).
+         */
+        std::size_t getMyNumberOfParticles() const { return this->getParticleCount(); }
 
         /**
          * @brief To get the center of the box from the simulation file opened by the loader.
          *
+         *  @return box center (type Point)
          */
         inline auto getCenterOfBox() const -> container::point<FReal, Dimension> { return this->getBoxCenter(); }
-
+        /**
+         * @brief Get the center of the box containing the particles
+         *
+         * @return A point (container::point<FReal>) representing the box center
+         */
+        inline auto getBoxCenter() const { return this->m_centerOfBox; }
         /**
          * @brief Returns a pointer on the element of the Box center.
          *
@@ -283,13 +245,6 @@ namespace scalfmm::io
          */
         inline auto getParticleCount() const -> std::size_t { return this->m_nbParticles; }
 
-        /**
-         * @brief Get the center of the box contining the particles
-         *
-         * @return A point (ontainer::point<FReal>) representing the box center
-         */
-        inline auto getBoxCenter() const { return this->m_centerOfBox; }
-
         /**
          * @brief box width from the simulation file opened by the loader
          *
@@ -620,10 +575,13 @@ namespace scalfmm::io
          *
          * @param filename the name of the file to open.
          */
-        FFmaGenericWriter(const std::string& filename)
+        FFmaGenericWriter(std::string const& filename, const bool verbose = true)
           : m_binaryFile(false)
         {
-            std::cout << "FFmaGenericWriter filename " << filename << std::endl;
+            if(verbose)
+            {
+                std::cout << "FFmaGenericWriter filename " << filename << std::endl;
+            }
             std::string ext(".bfma");
             // open particle file
             if(filename.find(".bfma") != std::string::npos)
@@ -649,44 +607,40 @@ namespace scalfmm::io
                 std::cerr << "File " << filename << " not opened! " << std::endl;
                 std::exit(EXIT_FAILURE);
             }
-            std::cout << "FFmaGenericWriter file " << filename << " opened" << std::endl;
-        }
-
-        /**
-         * @brief Construct a new FFmaGenericWriter object
-         *
-         * This constructor opens a file to be written to.
-         *
-         * @param filename the name of the file to open.
-         * @param binary   true if the file to open is in binary mode
-         */
-        FFmaGenericWriter(const std::string& filename, const bool binary)
-          : m_file(nullptr)
-          , m_binaryFile(binary)
-        {
-            if(binary)
-            {
-                this->m_file = new std::fstream(filename.c_str(), std::ifstream::out | std::ios::binary);
-            }
-            else
-            {
-                this->m_file = new std::fstream(filename.c_str(), std::ifstream::out);
-                this->m_file->precision(std::numeric_limits<FReal>::digits10);
-            }
-            // test if open
-            if(!this->m_file->is_open())
+            if(verbose)
             {
-                std::cerr << "File " << filename << " not opened! " << std::endl;
-                std::exit(EXIT_FAILURE);
+                std::cout << "FFmaGenericWriter file " << filename << " opened" << std::endl;
             }
-            std::cout << "FFmaGenericWriter file " << filename << " opened" << std::endl;
         }
 
-        /**
-         * @brief
-         *
-         */
-        inline auto close() -> void
+        // /**
+        //  * This constructor opens a file to be written to.
+        //  *
+        //  * @param filename the name of the file to open.
+        //  * @param binary   true if the file to open is in binary mode
+        //  */
+        // FFmaGenericWriter(const std::string& filename, const bool binary)
+        //   : m_file(nullptr)
+        //   , m_binaryFile(binary)
+        // {
+        //     if(binary)
+        //     {
+        //         this->m_file = new std::fstream(filename.c_str(), std::ifstream::out | std::ios::binary);
+        //     }
+        //     else
+        //     {
+        //         this->m_file = new std::fstream(filename.c_str(), std::ifstream::out);
+        //         this->m_file->precision(std::numeric_limits<FReal>::digits10);
+        //     }
+        //     // test if open
+        //     if(!this->m_file->is_open())
+        //     {
+        //         std::cerr << "File " << filename << " not opened! " << std::endl;
+        //         std::exit(EXIT_FAILURE);
+        //     }
+        //     std::cout << "FFmaGenericWriter file " << filename << " opened" << std::endl;
+        // }
+        void close()
         {
             m_file->close();
             delete m_file;
@@ -911,32 +865,33 @@ namespace scalfmm::io
             std::vector<particles_t> particles(number_particles);
             //
             int pos = 0;
-            scalfmm::component::for_each_leaf(std::cbegin(tree), std::cend(tree),
-                                              [&pos, &particles](auto& leaf)
-                                              {
-                                                  for(auto const& it_p: leaf)
-                                                  {
-                                                      auto& particles_elem = particles[pos++];
-                                                      const auto& p = typename TreeType::leaf_type::particle_type(it_p);
-                                                      //
-                                                      int i = 0;
-                                                      const auto points = p.position();
-                                                      for(int k = 0; k < dimension; ++k, ++i)
-                                                      {
-                                                          particles_elem[i] = points[k];
-                                                      }
-                                                      // get inputs
-                                                      for(int k = 0; k < nb_input_elements; ++k, ++i)
-                                                      {
-                                                          particles_elem[i] = p.inputs(k);
-                                                      }
-                                                      // get outputs
-                                                      for(int k = 0; k < nb_output_elements; ++k, ++i)
-                                                      {
-                                                          particles_elem[i] = p.outputs(k);
-                                                      }
-                                                  }
-                                              });
+            scalfmm::component::for_each_mine_leaf(tree.begin_mine_leaves(), tree.end_mine_leaves(),
+                                                   [&pos, &particles](auto& leaf)
+                                                   {
+                                                       for(auto const& it_p: leaf)
+                                                       {
+                                                           auto& particles_elem = particles[pos++];
+                                                           const auto& p =
+                                                             typename TreeType::leaf_type::particle_type(it_p);
+                                                           //
+                                                           int i = 0;
+                                                           const auto points = p.position();
+                                                           for(int k = 0; k < dimension; ++k, ++i)
+                                                           {
+                                                               particles_elem[i] = points[k];
+                                                           }
+                                                           // get inputs
+                                                           for(int k = 0; k < nb_input_elements; ++k, ++i)
+                                                           {
+                                                               particles_elem[i] = p.inputs(k);
+                                                           }
+                                                           // get outputs
+                                                           for(int k = 0; k < nb_output_elements; ++k, ++i)
+                                                           {
+                                                               particles_elem[i] = p.outputs(k);
+                                                           }
+                                                       }
+                                                   });
             //
             // write the particles
             const auto& centre = tree.box_center();
diff --git a/include/scalfmm/tree/dist_group_tree.hpp b/include/scalfmm/tree/dist_group_tree.hpp
index e7d1b40af66b9c07e08a06b5bfc6693f77b92c86..fae3d95612a757ff0705013da707aa120f8b6930 100644
--- a/include/scalfmm/tree/dist_group_tree.hpp
+++ b/include/scalfmm/tree/dist_group_tree.hpp
@@ -6,13 +6,6 @@
 #ifndef SCALFMM_TREE_DIST_GROUP_TREE_HPP
 #define SCALFMM_TREE_DIST_GROUP_TREE_HPP
 
-#include "scalfmm/tree/box.hpp"
-#include "scalfmm/tree/group_let.hpp"
-#include "scalfmm/tree/group_tree_view.hpp"
-#include "scalfmm/utils/io_helpers.hpp"
-
-#include <cpp_tools/colors/colorized.hpp>
-
 #include <array>
 #include <fstream>
 #include <iostream>
@@ -21,6 +14,16 @@
 #include <utility>
 #include <vector>
 
+#include "scalfmm/tree/box.hpp"
+#include <scalfmm/tree/group_let.hpp>
+#include <scalfmm/tree/group_tree_view.hpp>
+#include <scalfmm/utils/io_helpers.hpp>
+
+#include <scalfmm/parallel/comm_access.hpp>
+
+#include <cpp_tools/colors/colorized.hpp>
+#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+
 namespace scalfmm::component
 {
     /**
@@ -40,9 +43,13 @@ namespace scalfmm::component
         using base_type = group_tree_view<Cell, Leaf, Box>;
         using leaf_iterator_type = typename base_type::leaf_iterator_type;
         using const_leaf_iterator_type = typename base_type::const_leaf_iterator_type;
+        using cell_iterator_type = typename base_type::cell_iterator_type;
         using cell_group_level_iterator_type = typename base_type::cell_group_level_type::iterator;
         using iterator_type = typename base_type::iterator_type;
         using const_iterator_type = typename base_type::const_iterator_type;
+        using grp_access_type = std::pair<cell_group_level_iterator_type, int>;
+
+        static constexpr std::size_t dimension = base_type::box_type::dimension;
 
         /**
          * @brief Construct a new dist group tree object
@@ -61,9 +68,29 @@ namespace scalfmm::component
           : base_type(tree_height, order, size_leaf_blocking, size_cell_blocking, box)
           , m_parallel_manager(parallel_manager)
           , m_level_shared{level_shared}
+          , transfer_acces(tree_height, parallel_manager.get_num_processes())
 
         {
             m_cell_distrib.resize(tree_height);
+            up_down_acces.resize(tree_height);
+
+            // transfer_acces.m_receive_multipoles_type.resize(tree_height);
+            // transfer_acces.m_send_multipoles_type.resize(tree_height);
+            // transfer_acces.m_send_morton.resize(tree_height);
+            // for(auto& vec: transfer_acces.m_send_morton)
+            // {
+            //     vec.resize(parallel_manager.get_num_processes());
+            // }
+            // m_receive_cells_access.resize(tree_height);
+            // for(auto& vec: transfer_acces.m_receive_cells_access)
+            // {
+            //     vec.resize(parallel_manager.get_num_processes());
+            // }
+            // m_send_cells_access.resize(tree_height);
+            // for(auto& vec: transfer_acces.m_send_cells_access)
+            // {
+            //     vec.resize(parallel_manager.get_num_processes());
+            // }
         }
 
         // template<typename ParticleContainer>
@@ -267,9 +294,10 @@ namespace scalfmm::component
                                    VectorMortonIndexType const& ghosts_m2l, VectorMortonIndexType const& ghosts_m2m,
                                    const std::int64_t& ghost_l2l, data_distrib_value_type const& cell_distrib) -> void
         {
-            //io::print("create_cells_at_level mortonIdx", mortonIdx);
-            //io::print("ghosts_m2l", ghosts_m2l);
-            //io::print("ghosts_m2m", ghosts_m2m);
+            io::print(std::clog, "create_cells_at_level mortonIdx", mortonIdx);
+            io::print(std::clog, "ghosts_m2l", ghosts_m2l);
+            io::print(std::clog, "ghosts_m2m", ghosts_m2m);
+            std::clog << std::endl << std::flush;
 
             // construct group of cells at leaf level
             auto first_index = cell_distrib[0];
@@ -288,20 +316,23 @@ namespace scalfmm::component
                 ghost_left_mortonIdx.back() = ghost_l2l;
             }
 
-            //io::print("create_from_leaf : ghost_left_mortonIdx ", ghost_left_mortonIdx);
+            // io::print("create_from_leaf : ghost_left_mortonIdx ", ghost_left_mortonIdx);
             this->build_groups_of_cells_at_level(ghost_left_mortonIdx, level, false);
             this->build_cells_in_groups_at_level(ghost_left_mortonIdx, base_type::m_box, level);
 
-            //io::print("ghost_left_mortonIdx ", ghost_left_mortonIdx);
+            // io::print("ghost_left_mortonIdx ", ghost_left_mortonIdx);
+            // std::cout << std::endl << std::flush;
 
             auto left_block_cells = std::move(base_type::m_group_of_cell_per_level.at(level));
+            // std::cout << " merge_unique_fast " << std::endl << std::flush;
             auto ghost_right_mortonIdx = scalfmm::parallel::utils::merge_unique_fast<VectorMortonIndexType>(
               last, ghosts_m2l.end(), ghosts_m2m.begin(), ghosts_m2m.end());
 
-            //io::print("create_from_leaf : ghost_right_mortonIdx ", ghost_right_mortonIdx);
+            // io::print("create_from_leaf : ghost_right_mortonIdx ", ghost_right_mortonIdx);
+            // std::cout << std::endl << std::flush;
+
             this->build_groups_of_cells_at_level(ghost_right_mortonIdx, level, false);
             this->build_cells_in_groups_at_level(ghost_right_mortonIdx, base_type::m_box, level);
-
             auto right_block_cells = std::move(base_type::m_group_of_cell_per_level.at(level));
             this->build_groups_of_cells_at_level(mortonIdx, level);
             this->build_cells_in_groups_at_level(mortonIdx, base_type::m_box, level);
@@ -309,15 +340,17 @@ namespace scalfmm::component
             auto local_block_cells = std::move(base_type::m_group_of_cell_per_level.at(level));
             auto all_cells_blocks =
               scalfmm::tree::let::merge_blocs(left_block_cells, local_block_cells, right_block_cells);
-            // std::cout << "  All cells blocks at level " << level << " size: " << all_cells_blocks.size() <<
-            // std::endl; int tt{0}; for(auto pg: all_cells_blocks)
+            // std::cout << "  All cells blocks at level " << level << " size: " << all_cells_blocks.size() << std::endl
+            //           << std::flush;
+            // int tt{0};
+            // for(auto pg: all_cells_blocks)
             // {
             //     std::cout << "block index " << tt++ << " ";
             //     pg->print();
-            //     std::cout << std::endl;
+            //     std::cout << std::endl << std::flush;
             //     // pg->cstorage().print_block_data(std::cout);
             // }
-            // std::cout << std::endl;
+            // std::cout << std::endl << std::flush;
             base_type::m_group_of_cell_per_level.at(level) = std::move(all_cells_blocks);
             auto& grp_level = base_type::m_group_of_cell_per_level.at(level);
             int idx{0};
@@ -407,8 +440,10 @@ namespace scalfmm::component
 
             auto leaf_level = base_type::m_tree_height - 1;
             std::int64_t ghost_l2l_cell{-1};
+            // std::cout << "       this->create_cells_at_level  \n" << std::flush;
 
             this->create_cells_at_level(leaf_level, mortonIdx, ghosts_m2l, ghosts_m2m, ghost_l2l_cell, cell_distrib);
+            // std::cout << " END create_from_leaf_level \n" << std::flush;
             //
         }
 
@@ -600,12 +635,13 @@ namespace scalfmm::component
         template<typename ParticleContainer>
         auto fill_leaves_with_particles(ParticleContainer const& particle_container) -> void
         {
             //	  using scalfmm::details::tuple_helper;
             // using proxy_type = typename particle_type::proxy_type;
             // using const_proxy_type = typename particle_type::const_proxy_type;
             // using outputs_value_type = typename particle_type::outputs_value_type;
             auto begin_container = std::begin(particle_container);
-            std::size_t group_index{0};
+            std::size_t group_index{0}, part_src_index{0};
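+            // part_src_index runs over the whole sorted particle container and is shared by all the
+            // groups owned by this process, hence it is declared here and not reset for each group.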
             // std::cout << cpp_tools::colors::red << " particle_container.size() " << particle_container.size()
             //           << std::endl;
             // std::cout << " nb of mine grp "
@@ -615,21 +651,21 @@ namespace scalfmm::component
             for(auto pg = base_type::cbegin_mine_leaves(); pg != base_type::cend_mine_leaves(); ++pg)
             {
                 auto& group = *(pg->get());
-                std::size_t part_src_index{0};
 
                 std::size_t leaf_index{0};
                 auto leaves_view = group.components();
                 // loop on leaves
-                for(auto const& leaf: group.components())
+                for(auto const& leaf: leaves_view)
                 {
                     // get the leaf container
                     auto leaf_container_begin = leaf.particles().first;
-                    // std::cout << " nb part in leaf " << leaf.index() << " leaf.size() " << leaf.size() << std::endl
+                    // std::cout << " leaf index " << leaf.index() << " leaf.size() " << leaf.size() << std::endl
                     //           << std::flush;
                     // copy the particle in the leaf
                     for(std::size_t index_part = 0; index_part < leaf.size(); ++index_part)
                     {
-                        // std::cout << " index_part " << index_part << " part_src_index " << part_src_index << std::endl
+                        // std::cout << " --> index_part " << index_part << " part_src_index " << part_src_index
+                        //           << std::endl
                         //           << std::flush;
                         // get the source index in the source container
                         // auto source_index = std::get<1>(tuple_of_indexes.at(part_src_index));
@@ -639,7 +675,8 @@ namespace scalfmm::component
                         // copy the particle
 
                         // *leaf_container_begin = particle_container.particle(source_index).as_tuple();
-                        // std::cout << part_src_index << " p " << particle_container.at(part_src_index) << std::endl;
+                        // std::cout << " -->            " << part_src_index << " p "
+                        // << particle_container.at(part_src_index) << std::endl;
                         *leaf_container_begin = particle_container.at(part_src_index).as_tuple();
 
                         //         proxy_type particle_in_leaf(*leaf_container_begin);
@@ -657,16 +694,17 @@ namespace scalfmm::component
                 ++group_index;
                 // std::cout << " group " << group << std::endl;
             }
-            // #ifdef _DEBUG_BLOCK_DATA
-            //             std::clog << "  FINAL block\n";
-            //             int tt{0};
-            //             for(auto pg: m_group_of_leaf)
-            //             {
-            //                 std::clog << "block index " << tt++ << std::endl;
-            //                 pg->cstorage().print_block_data(std::clog);
-            //             }
-            //             std::clog << "  ---------------------------------------------------\n";
-            // #endif
+#ifdef _DEBUG_BLOCK_DATA
+            std::clog << "  FINAL block\n";
+            int tt{0};
+            for(auto pg: m_group_of_leaf)
+            {
+                std::clog << "block index " << tt++ << std::endl;
+                pg->cstorage().print_block_data(std::clog);
+            }
+            std::clog << "  ---------------------------------------------------\n";
+#endif
         }
 
         /**
@@ -679,6 +717,56 @@ namespace scalfmm::component
             return m_parallel_manager;
         }
 
+        auto inline get_send_multipole_types(const int& level) -> std::vector<MPI_Datatype>&
+        {
+            return transfer_acces.get_send_multipole_types(level);
+        }
+
+        // auto inline print_send_multipole_types(const int& level) -> void
+        // {
+        //     auto const& type = m_send_multipoles_type[level];
+        //     for(int p = 0; p < type.size(); ++p)
+        //     {
+        //         std::cout << "   ptr_data_type(" << p << ") " << &(type[p]) << " level: " << level << std::endl
+        //                   << std::flush;
+        //     }
+        // }
+        auto inline get_receive_multipole_types(const int& level) -> std::vector<MPI_Datatype>&
+        {
+            return transfer_acces.get_receive_multipole_types(level);
+        }
+        auto inline send_morton_indexes(int const& level, int const& proc) -> std::vector<morton_type>&
+        {
+            return transfer_acces.send_morton_indexes(level, proc);
+        }
+        auto inline send_morton_indexes(int const& level) -> std::vector<std::vector<morton_type>>&
+        {
+            return transfer_acces.send_morton_indexes(level);
+        }
+        auto inline receive_cells_access(int const& level) -> std::vector<std::vector<grp_access_type>>&
+        {
+            return transfer_acces.receive_cells_access(level);
+        }
+
+        auto inline send_cells_access(int const& level) -> std::vector<std::vector<grp_access_type>>&
+        {
+            return transfer_acces.send_cells_access(level);
+        }
+        // auto inline set_receive_access(const int level, std::vector<std::vector<grp_access_type>>& access) -> void
+        // {
+        //     m_receive_access[level] = std::move(access);
+        // }
+        // auto inline get_ptr_send_access(const int& level) -> grp_access_type*
+        // {
+        //     return m_send_multipoles_type[level].data();
+        // }
+        // auto inline get_ptr_receive_access(const int& level) -> grp_access_type*
+        // {
+        //     return m_receive_multipoles_type[level].data();
+        // }
+        auto get_up_down_access(const int level) -> UpDownDataAccess& { return up_down_acces[level]; }
+        auto get_up_down_access(const int level) const -> UpDownDataAccess const& { return up_down_acces[level]; }
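+        // Illustrative use in the transfer pass (sketch only, assuming `tree` is a dist_group_tree):
+        //   auto& types  = tree.get_send_multipole_types(level);   // one MPI_Datatype per process
+        //   auto& morton = tree.send_morton_indexes(level, proc);  // Morton indexes sent to proc
+        //   auto& access = tree.send_cells_access(level)[proc];    // (group iterator, index) pairs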
+
       private:
         /**
          * @brief a reference on the parallel manager
@@ -703,6 +791,18 @@ namespace scalfmm::component
          *
          */
         int m_level_shared;
+        ///
+        /// Structures for the communications in the transfer step
+
+        // std::vector<std::vector<MPI_Datatype>> m_send_multipoles_type;
+        // std::vector<std::vector<MPI_Datatype>> m_receive_multipoles_type;
+        // // vector per level and per process
+        // std::vector<std::vector<std::vector<morton_type>>> m_send_morton;
+        // std::vector<std::vector<std::vector<grp_access_type>>> m_receive_cells_access;
+        // std::vector<std::vector<std::vector<grp_access_type>>> m_send_cells_access;
+
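+        /**
+         * @brief Per-level access helpers: up_down_acces holds the UpDownDataAccess information for
+         * the upward/downward passes, while transfer_acces gathers the communication metadata of the
+         * transfer pass (MPI datatypes, Morton indexes and cell accesses per level and per process).
+         */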
+        std::vector<UpDownDataAccess> up_down_acces;
+        transferDataAccess<morton_type, grp_access_type> transfer_acces;
     };
 }   // namespace scalfmm::component
 
diff --git a/include/scalfmm/tree/for_each.hpp b/include/scalfmm/tree/for_each.hpp
index 449e4da07e3641d38191c56ba753846c9e8ff9b5..67d512422f0fa5b82a8586449d02931e8b17f42e 100644
--- a/include/scalfmm/tree/for_each.hpp
+++ b/include/scalfmm/tree/for_each.hpp
@@ -138,7 +138,27 @@ namespace scalfmm::component
 
         return f;
     }
+    /// @brief Applies a function to all components of the groups between the two group iterators.
+    /// @tparam InputTreeIterator iterator type over groups of components
+    /// @tparam UnaryFunction type of the callable applied to each component
+    /// @param begin iterator on the first group to traverse
+    /// @param end iterator past the last group to traverse
+    /// @param f the function applied to each component (leaf or cell)
+    /// @return the function f
+    template<typename InputTreeIterator, typename UnaryFunction>
+    inline auto for_each_mine_component(InputTreeIterator begin, InputTreeIterator end,
+                                        UnaryFunction f) -> UnaryFunction
+    {
+        for(auto group_leaf_iterator_begin = begin; group_leaf_iterator_begin != end; ++group_leaf_iterator_begin)
+        {
+            for(auto&& leaf: (*group_leaf_iterator_begin)->components())
+            {
+                f(leaf);
+            }
+        }
 
+        return f;
+    }
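+    // Illustrative use (sketch): traverse only the groups owned by the current process, assuming the
+    // tree exposes the begin_mine_leaves()/end_mine_leaves() iterators used elsewhere in this patch:
+    //   scalfmm::component::for_each_mine_component(tree.begin_mine_leaves(), tree.end_mine_leaves(),
+    //                                               [](auto& leaf) { /* use the leaf */ });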
     /**
      * @brief iterate on two (same) leaf structures (same group size)
      *
diff --git a/include/scalfmm/tree/group_let.hpp b/include/scalfmm/tree/group_let.hpp
index aa52cfc47b87263389a93a20bed6e0c149a96674..1b7e1fd7b639d955bda9146a606d053531cf2f36 100644
--- a/include/scalfmm/tree/group_let.hpp
+++ b/include/scalfmm/tree/group_let.hpp
@@ -1,9 +1,21 @@
 // --------------------------------
 // See LICENCE file at project root
 // File : scalfmm/tree/group_let.hpp
 // --------------------------------
-#ifndef SCALFMM_TREE_LET_HPP
+#ifndef SCALFMM_TREE_LET_HPP
 #define SCALFMM_TREE_LET_HPP
+#include <algorithm>
+#include <array>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include <cpp_tools/colors/colorized.hpp>
+#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+
+#include "scalfmm/tree/utils.hpp"
+#include "scalfmm/utils/io_helpers.hpp"   // for io::print
+#include "scalfmm/utils/math.hpp"
 
 #include "scalfmm/container/particle_container.hpp"
 #include "scalfmm/lists/sequential.hpp"
@@ -12,11 +24,8 @@
 #include "scalfmm/parallel/mpi/utils.hpp"
 #include "scalfmm/parallel/utils.hpp"
 #include "scalfmm/tree/for_each.hpp"
-#include "scalfmm/tree/utils.hpp"
-#include "scalfmm/utils/io_helpers.hpp"
-#include "scalfmm/utils/math.hpp"
-
 #ifdef SCALFMM_USE_MPI
+
 #include <inria/algorithm/distributed/distribute.hpp>
 #include <inria/algorithm/distributed/mpi.hpp>
 #include <inria/algorithm/distributed/sort.hpp>
@@ -24,33 +33,15 @@
 #include <mpi.h>
 #endif
 
-#include <cpp_tools/colors/colorized.hpp>
-#include <cpp_tools/parallel_manager/parallel_manager.hpp>
-
-#include <algorithm>
-#include <array>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <string>
-#include <tuple>
-#include <utility>
-#include <vector>
-
 namespace scalfmm::tree
 {
     using morton_type = std::int64_t;   // typename Tree_type::
 
-    /**
-     * @brief
-     *
-     * @tparam MortonIdxType
-     */
-    template<typename MortonIdxType>
+    template<typename MortonIdx>
     struct leaf_info_type
     {
-        using morton_type = MortonIdxType;
-        MortonIdxType morton{};
+        using morton_type = MortonIdx;
+        MortonIdx morton{};
         std::size_t number_of_particles{};
         friend std::ostream& operator<<(std::ostream& os, const leaf_info_type& w)
         {
@@ -62,18 +53,11 @@ namespace scalfmm::tree
     namespace let
     {
 
-        /**
-         * @brief
-         *
-         * @tparam BoxType
-         * @tparam VectorLeafInfoType
-         * @tparam MortonDistributionType
-         */
-        template<typename BoxType, typename VectorLeafInfoType, typename MortonDistributionType>
-        inline /*std::vector<morton_type>*/ VectorLeafInfoType
-        get_ghosts_p2p_interaction(cpp_tools::parallel_manager::parallel_manager& para, BoxType const& box,
-                                   std::size_t const& level, int const& separation, VectorLeafInfoType const& leaf_info,
-                                   MortonDistributionType const& leaves_distrib)
+        template<typename Box, typename VectorLeafInfo, typename MortonDistribution>
+        inline /*std::vector<morton_type>*/ VectorLeafInfo
+        get_ghosts_p2p_interaction(cpp_tools::parallel_manager::parallel_manager& para, Box const& box,
+                                   std::size_t const& level, int const& separation, VectorLeafInfo const& leaf_info,
+                                   MortonDistribution const& leaves_distrib)
         {
             std::vector<morton_type> ghost_to_add;
             auto const& period = box.get_periodicity();
@@ -83,14 +67,17 @@ namespace scalfmm::tree
             for(auto const& info: leaf_info)
             {
                 auto const& morton_index = info.morton;
-                auto coordinate{index::get_coordinate_from_morton_index<BoxType::dimension>(morton_index)};
+                auto coordinate{index::get_coordinate_from_morton_index<Box::dimension>(morton_index)};
                 auto interaction_neighbors = index::get_neighbors(coordinate, level, period, separation);
                 auto& list = std::get<0>(interaction_neighbors);
                 auto nb = std::get<1>(interaction_neighbors);
                 int it{0};
+                //io::print("rank(" + std::to_string(rank) + ") list idx(p2p)  : ", list);
 
                 while(list[it] < my_distrib[0])
                 {
+                    // std::cout << "INSIDE left idx " << list[it] << "  " << std::boolalpha
+                    //             << parallel::utils::is_inside_distrib(list[it], leaves_distrib) << std::endl;
                     if(parallel::utils::is_inside_distrib_left(list[it], rank, leaves_distrib))
                     {
                         ghost_to_add.push_back(list[it]);
@@ -100,6 +87,8 @@ namespace scalfmm::tree
                 it = nb - 1;
                 while(list[it] >= my_distrib[1])
                 {
+                    //      std::cout << "INSIDE right idx " << list[it] << "  " << std::boolalpha
+                    //               << parallel::utils::is_inside_distrib(list[it], leaves_distrib) << std::endl;
                     if(parallel::utils::is_inside_distrib_right(list[it], rank, leaves_distrib))
                     {
                         ghost_to_add.push_back(list[it]);
@@ -110,7 +99,7 @@ namespace scalfmm::tree
             std::sort(ghost_to_add.begin(), ghost_to_add.end());
             auto last = std::unique(ghost_to_add.begin(), ghost_to_add.end());
             ghost_to_add.erase(last, ghost_to_add.end());
-            VectorLeafInfoType ghost_leaf_to_add(ghost_to_add.size());
+            VectorLeafInfo ghost_leaf_to_add(ghost_to_add.size());
             for(int i = 0; i < ghost_to_add.size(); ++i)
             {
                 ghost_leaf_to_add[i] = {ghost_to_add[i], 0};
@@ -118,32 +107,25 @@ namespace scalfmm::tree
 
             return ghost_leaf_to_add;
         }
-
-        /**
-         * @brief  get theoretical m2l interaction list outside me
-         *
-         * We return the list of indexes of cells involved in P2P interaction that we do
-         *  not have locally.  The cells on other processors may not exist.
-         *
-         * @tparam BoxType
-         * @tparam VectorMortonIdxType
-         * @tparam MortonDistributionType
-         * @param[in] para the parallel manager
-         * @param box
-         * @param level
-         * @param separation
-         * @param local_morton_vect
-         * @param cell_distrib the cells distribution on the processes
-         * @return the list of indexes on tother processes
-         */
-        template<typename BoxType, typename VectorMortonIdxType, typename MortonDistributionType>
-        inline VectorMortonIdxType get_ghosts_m2l_interaction(cpp_tools::parallel_manager::parallel_manager& para,
-                                                              BoxType const& box, const std::size_t& level,
-                                                              int const& separation,
-                                                              VectorMortonIdxType const& local_morton_vect,
-                                                              MortonDistributionType const& cell_distrib)
+        ///
+        /// \brief get the theoretical M2L interaction list outside the current process
+        ///
+        ///  We return the list of indexes of the cells involved in the M2L interactions that we do
+        ///  not have locally. The cells on other processors may not exist.
+        ///
+        /// \param[in] para the parallel manager
+        /// \param box the simulation box
+        /// \param[in] level the level at which the interaction lists are built
+        /// \param[in] separation the separation criterion used to build the interaction lists
+        /// \param[in] local_morton_vect the Morton indexes of the local cells
+        /// \param[in] cell_distrib the cells distribution on the processes
+        /// \return the list of indexes owned by other processes
+        ///
+        template<typename Box, typename VectorMortonIdx, typename MortonDistribution>
+        inline VectorMortonIdx
+        get_ghosts_m2l_interaction(cpp_tools::parallel_manager::parallel_manager& para, Box const& box,
+                                   const std::size_t& level, int const& separation,
+                                   VectorMortonIdx const& local_morton_vect, MortonDistribution const& cell_distrib)
         {
-            VectorMortonIdxType ghost_to_add;
+            VectorMortonIdx ghost_to_add;
             auto const& period = box.get_periodicity();
             const auto rank = para.get_process_id();
             auto const my_distrib = cell_distrib[rank];
@@ -152,36 +134,64 @@ namespace scalfmm::tree
             for(auto morton_index: local_morton_vect)
             {
                 // for each index in the vector of cells in local_morton_vect we compute the m2l interactions
-                auto coordinate{index::get_coordinate_from_morton_index<BoxType::dimension>(morton_index)};
+                auto coordinate{index::get_coordinate_from_morton_index<Box::dimension>(morton_index)};
                 auto interaction_m2l_list = index::get_m2l_list(coordinate, level, period, separation);
 
                 auto& list = std::get<0>(interaction_m2l_list);
                 auto nb = std::get<2>(interaction_m2l_list);
+                //
+                // io::print("rank(" + std::to_string(rank) + ") list idx(m2l)  : ", list);
+                // io::print("rank(" + std::to_string(rank) + ") my_distrib  : ", my_distrib);
 
                 int it{0};
                 // We check if the cells are in the distribution
                 for(auto it = 0; it < nb; ++it)
                 {
+                    // if(list[it] > my_distrib[0])
+                    // std::cout << list[it] << " " << std::boolalpha
+                    //           << math::between(list[it], my_distrib[0], my_distrib[1]) << std::endl;
+
                     if(math::between(list[it], my_distrib[0], my_distrib[1]))
                     {
                         break;
                     }
                     bool check{false};
+                    // for(int i = 0; i < rank; ++i)
                     for(int i = rank - 1; i >= 0; i--)
                     {
                         auto const& interval = cell_distrib[i];
+                        // // if(rank == 2)
+                        // {
+                        //     std::cout << "parallel::utils::is_inside_distrib_left list[it]: " << interval[0] << " < "
+                        //     << list[it]
+                        //               << " < " << interval[1] << std::endl;
+                        // }
                         check = math::between(list[it], interval[0], interval[1]);
                         if(check)
                         {
                             break;
                         }
                     }
+                    // std::cout << "                 " << list[it] << "  " << std::boolalpha << check << std::endl;
                     if(check)   // parallel::utils::is_inside_distrib_left(list[it], rank, cell_distrib))
                     {
                         ghost_to_add.push_back(list[it]);
                     }
                 }
-
+                // while(list[it] < my_distrib[0])
+                // {
+                //     std::cout << it << " INSIDE left idx " << list[it] << "  " << std::boolalpha
+                //               << parallel::utils::is_inside_distrib(list[it], cell_distrib) << std::endl;
+                //     if(parallel::utils::is_inside_distrib_left(list[it], rank, cell_distrib))
+                //     {
+                //         ghost_to_add.push_back(list[it]);
+                //     }
+                //     ++it;
+                //     if(it > nb)
+                //     {
+                //         break;
+                //     }
+                // }
                 it = nb - 1;
                 if(not last_proc)   // No ghost on the right on last process
                 {
@@ -194,22 +204,20 @@ namespace scalfmm::tree
                         --it;
                     }
                 }
+                // if(rank == 2)
+                // {
+                //     io::print("rank(" + std::to_string(rank) + ") tmp ghost_to_add(m2l)  : ", ghost_to_add);
+                // }
             }
             std::sort(ghost_to_add.begin(), ghost_to_add.end());
             auto last = std::unique(ghost_to_add.begin(), ghost_to_add.end());
             ghost_to_add.erase(last, ghost_to_add.end());
+            // io::print("rank(" + std::to_string(rank) + ") cell_distrib: ", cell_distrib);
+            // io::print("rank(" + std::to_string(rank) + ") ghost_to_add(m2l): ", ghost_to_add);
 
             return ghost_to_add;
         }
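+        // Illustrative example: assuming rank 0 owns the Morton interval [0, 7] and rank 1 owns [8, 15],
+        // an M2L interaction index 5 required by rank 1 is kept as a left ghost (rank 0 owns it), while
+        // an index that lies in no interval is dropped, since the corresponding cell does not exist.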
 
-        /**
-         * @brief
-         *
-         * @tparam VectorLeafInfoType
-         * @param localLeaves
-         * @param ghosts
-         * @return auto
-         */
         template<typename VectorLeafInfoType>
         auto merge_split_structure(VectorLeafInfoType const& localLeaves, VectorLeafInfoType const& ghosts)
         {
@@ -257,7 +265,6 @@ namespace scalfmm::tree
 
             return std::make_tuple(morton, number_of_particles);
         }
-
         /**
          * @brief Split the LeafInfo structure in two vectors (Morton, number_of_particles)
          *
@@ -282,15 +289,6 @@ namespace scalfmm::tree
             }
             return std::make_tuple(morton, number_of_particles);
         }
-
-        /**
-         * @brief
-         *
-         * @tparam VectorLeafInfoIteratorType
-         * @param begin
-         * @param end
-         * @return auto
-         */
         template<typename VectorLeafInfoIteratorType>
         auto split_structure(const VectorLeafInfoIteratorType begin, const VectorLeafInfoIteratorType end)
         {
@@ -319,8 +317,8 @@ namespace scalfmm::tree
          * @return the vector of the three blocs
          */
         template<typename VectorBlockType>
-        auto merge_blocs(VectorBlockType const& bloc1, VectorBlockType const& bloc2,
-                         VectorBlockType const& bloc3) -> VectorBlockType
+        VectorBlockType merge_blocs(VectorBlockType const& bloc1, VectorBlockType const& bloc2,
+                                    VectorBlockType const& bloc3)
         {
             // Merge the three block structure
             auto size = bloc1.size() + bloc2.size() + bloc3.size();
@@ -341,38 +339,41 @@ namespace scalfmm::tree
             }
             return all_blocks;
         }
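+        // Example of use (see dist_group_tree::create_cells_at_level in this patch): the three blocks
+        // are the left-ghost groups, the local groups and the right-ghost groups, merged in that order:
+        //   auto all_cells_blocks = merge_blocs(left_block_cells, local_block_cells, right_block_cells);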
-
         /**
          * @brief Construct the M2M ghost for the current level
          *
          *  The routine checks whether there are ghosts during the M2M operation.
          *  If yes, we exchange the ghost indexes
-         * @tparam BoxType
-         * @tparam VectorMortonIdxType
-         * @tparam MortonDistributionType
+         * @tparam Box
+         * @tparam VectorMortonIdx
+         * @tparam MortonDistribution
          * @param para  the parallel manager
          * @param box  the simulation box
          * @param level the current level
          * @param local_morton_vect
          * @param cells_distrib the distribution of cells
          * @param top if top is true nothing is done
-         * @return VectorMortonIdxType
+         * @return VectorMortonIdx
          */
-        template<typename BoxType, typename VectorMortonIdxType, typename MortonDistributionType>
-        [[nodiscard]] auto build_ghost_m2m_let_at_level(cpp_tools::parallel_manager::parallel_manager& para,
-                                                        BoxType& box, const int& level,
-                                                        const VectorMortonIdxType& local_morton_vect,
-                                                        const MortonDistributionType& cells_distrib,
-                                                        bool top = false) -> VectorMortonIdxType
+        template<typename Box, typename VectorMortonIdx, typename MortonDistribution>
+        [[nodiscard]] auto build_ghost_m2m_let_at_level(cpp_tools::parallel_manager::parallel_manager& para, Box& box,
+                                                        const int& level, VectorMortonIdx const& local_morton_vect,
+                                                        MortonDistribution const& cells_distrib,
+                                                        bool top = false) -> VectorMortonIdx
         {
-            using morton_type = typename VectorMortonIdxType::value_type;
-            static constexpr int nb_children = math::pow(2, BoxType::dimension);
-            VectorMortonIdxType ghosts;
+            using morton_type = typename VectorMortonIdx::value_type;
+            static constexpr int nb_children = math::pow(2, Box::dimension);
+            VectorMortonIdx ghosts;
+            std::clog << " begin build_ghost_m2m_let_at_level " << level << std::endl;
+            io::print(std::clog, "local_morton_vect: ", local_morton_vect);
+            std::clog << std::endl;
+
             if(top)
                 return ghosts;
             const auto rank = para.get_process_id();
             const auto proc = para.get_num_processes();
             auto comm = para.get_communicator();
+            // parallel::utils::print_distrib("level_dist[" + std::to_string(level) + "]: ", rank, cells_distrib);
 
             cpp_tools::parallel_manager::mpi::request mpi_status_left, mpi_status_right;
 
@@ -385,20 +386,26 @@ namespace scalfmm::tree
             //             parallel::utils::print_distrib("level_dist[leaf_level]): ", rank, cells_distrib);
             bool comm_left{false}, comm_right{false};
             // Check on left
+            std::clog << "check left\n ";
+
             if(rank > 0)
             {
                 auto first_index = local_morton_vect[0];
-                auto parent_first = first_index >> BoxType::dimension;
-                auto last_parent_previous_proc = cells_distrib[rank - 1][1] >> BoxType::dimension;
+                auto parent_first = first_index >> Box::dimension;
+                auto last_parent_previous_proc = (cells_distrib[rank - 1][1] - 1) >> Box::dimension;
+                // std::clog << "index : " << first_index << "   Parent ! " << parent_first << " "
+                //           << last_parent_previous_proc << std::endl;
 
                 if(parent_first == last_parent_previous_proc)
                 {
                     comm_left = true;
+                    //                    std::cout << "Need to exchange between " << rank << "  and " << rank - 1 << std::endl;
                     int idx{1};
                     send[idx] = local_morton_vect[0];
                     for(int i = 1; i < std::min(nb_children, int(local_morton_vect.size())); ++i)
                     {
-                        auto parent_index = local_morton_vect[i] >> BoxType::dimension;
+                        auto parent_index = local_morton_vect[i] >> Box::dimension;
+                        //			std::cout << "index : " << local_morton_vect[i] << "   Parent ! " << parent_first << " " << last_parent_previous_proc << std::endl;
                         if(parent_index == last_parent_previous_proc)
                         {
                             ++idx;
@@ -415,22 +422,23 @@ namespace scalfmm::tree
                     comm.isend(send.data(), nb_children, mpi_type, rank - 1, tag);
                 }
             }
-            //	    std::cout <<  "check right\n ";
+            // std::clog << "check right\n ";
             auto last_index = local_morton_vect[local_morton_vect.size() - 1];
-            auto parent_last = last_index >> BoxType::dimension;
-            //	    std::cout <<  "last_index " << last_index << " parent_last  " << parent_last <<std::endl;
+            auto parent_last = last_index >> Box::dimension;
+            // std::clog << "last_index " << last_index << " parent_last  " << parent_last << std::endl;
             ghosts.resize(0);
             if(rank < proc - 1)
             {
                 // check on left
-                auto first_parent_next_proc = cells_distrib[rank + 1][0] >> BoxType::dimension;
-                //                std::cout << "Parent ! " << parent_last << " " << first_parent_next_proc << std::endl;
+                auto first_parent_next_proc = cells_distrib[rank + 1][0] >> Box::dimension;
+                // std::clog << "Parent ! " << parent_last << " " << first_parent_next_proc << std::endl;
                 if(parent_last == first_parent_next_proc)
                 {
                     comm_right = true;
                     //                    std::cout << "Need to exchange between " << rank << "  and " << rank + 1 << std::endl;
                     /*mpi_status_right =*/comm.recv(recv.data(), nb_children, mpi_type, rank + 1, tag);
-
+                    // cpp_tools::parallel_manager::mpi::request::waitall(1, &mpi_status_right);
+                    //		                 io::print("recv ",recv );
                     ghosts.resize(recv[0]);
                     for(int i = 0; i < ghosts.size(); ++i)
                     {
@@ -438,42 +446,43 @@ namespace scalfmm::tree
                     }
                 }
             }
-
+            // io::print(std::clog, "m2m ghosts ", ghosts);
+            // std::clog << " end build_ghost_m2m_let_at_level" << std::endl;
             return ghosts;
         }
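+        // Worked example (illustrative): the parent of a cell is morton >> dimension and a parent has
+        // 2^dimension children; in 2D, cells 5 and 6 share the parent 1 (5 >> 2 == 6 >> 2 == 1), so if
+        // cell 5 is the last cell of the previous process and cell 6 is my first cell, the two ranks
+        // exchange up to nb_children Morton indexes (the send/recv arrays above) before the M2M pass.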
-
-        /**
-         * @brief construct the local essential tree (LET) at the level.
-         *
-         *  We start from a given Morton index distribution and we compute all
-         *  interactions needed
-         *   in the algorithm steps.
-         *  At the leaf level it corresponds to the interactions coming from the
-         *  direct pass (P2P operators)
-         *     and in the transfer pass (M2L operator). For the other levels we
-         *     consider only the M2L interactions.
-         * The leaves_distrib and the cells_distrib might be different
-         *  At the end the let has also all the interaction list computed
-         *
-         * @tparam BoxType
-         * @tparam VectorMortonIdxType
-         * @tparam MortonDistributionType
-         * @param para
-         * @param box
-         *  @param[in] level the level to construct the let
-         * @param local_morton_vect
-         *  @param[in] cells_distrib the morton index distribution for
-         * the cells at the leaf level.
-         * @param separation
-         * @return VectorMortonIdxType
-         */
-        template<typename BoxType, typename VectorMortonIdxType, typename MortonDistributionType>
-        [[nodiscard]] auto build_let_at_level(cpp_tools::parallel_manager::parallel_manager& para, BoxType& box,
-                                              const int& level, const VectorMortonIdxType& local_morton_vect,
-                                              const MortonDistributionType& cells_distrib,
-                                              const int& separation) -> VectorMortonIdxType
+        ///
+        /// \brief construct the local essential tree (LET) at the level.
+        ///
+        ///  We start from a given Morton index distribution and we compute all
+        ///  interactions needed
+        ///   in the algorithm steps.
+        ///  At the leaf level it corresponds to the interactions coming from the
+        ///  direct pass (P2P operators)
+        ///     and in the transfer pass (M2L operator). For the other levels we
+        ///     consider only the M2L interactions.
+        /// The leaves_distrib and the cells_distrib might be different
+        ///  At the end the let has also all the interaction list computed
+        ///
+        /// \param[in] para the parallel manager
+        /// \param[in] box the simulation box
+        /// \param[in] level the level at which the LET is constructed
+        /// \param[in] local_morton_vect the Morton indexes of the cells owned by the process
+        /// \param[in] cells_distrib the Morton index distribution of the cells at this level
+        /// \param[in] separation the separation criterion used to build the interaction lists
+        ///
+        template<typename Box, typename VectorMortonIdx, typename MortonDistribution>
+        [[nodiscard]] auto build_let_at_level(cpp_tools::parallel_manager::parallel_manager& para, Box& box,
+                                              const int& level, const VectorMortonIdx& local_morton_vect,
+                                              const MortonDistribution& cells_distrib,
+                                              const int& separation) -> VectorMortonIdx
         {
             const auto my_rank = para.get_process_id();
+            // std::cout << cpp_tools::colors::red << " --> Begin let::build_let_at_level() at level = " << level
+            //           << "dist: " << cells_distrib[my_rank] << cpp_tools::colors::reset << std::endl;
+            // io::print("rank(" + std::to_string(my_rank) + ") local_morton_vect  : ", local_morton_vect);
 
             //  we compute the cells needed in the M2L operator
 
@@ -485,10 +494,15 @@ namespace scalfmm::tree
             std::cout << std::flush;
             /// Look if the morton index really exists in the distributed tree
             parallel::utils::check_if_morton_index_exist(para, needed_idx, cells_distrib, local_morton_vect);
+            ///
+            // io::print("rank(" + std::to_string(my_rank) + ") check_if_morton_index_exist(m2l)  : ", needed_idx);
+            //
 
+            // std::cout << cpp_tools::colors::red
+            //           << "rank(" + std::to_string(my_rank) + ")-- > End let::build_let_at_level() "
+            //           << cpp_tools::colors::reset << std::endl;
             return needed_idx;
         }
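+        // The returned needed_idx contains only the M2L ghost Morton indexes that actually exist on
+        // their owner processes, as established by check_if_morton_index_exist() above.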
-
         // template<typename OctreeTree, typename VectorMortonIdx, typename MortonDistribution>
         // void build_let_at_level(cpp_tools::parallel_manager::parallel_manager& para, OctreeTree& tree,
         //                         const VectorMortonIdx& local_morton_idx, const MortonDistribution& cells_distrib,
@@ -516,13 +530,12 @@ namespace scalfmm::tree
         //     std::cout << cpp_tools::colors::green << " --> End let::build_let_at_level() at level = " << level
         //               << cpp_tools::colors::reset << std::endl;
         // }
-
         /**
          * @brief
          *
-         * @tparam BoxType
-         * @tparam VectorMortonIdxType
-         * @tparam MortonDistributionType
+         * @tparam Box
+         * @tparam VectorMortonIdx
+         * @tparam MortonDistribution
          * @param para
          * @param box
          * @param level
@@ -530,15 +543,19 @@ namespace scalfmm::tree
          * @param leaves_distrib
          * @param separation
          */
-        template<typename BoxType, typename VectorLeafInfo, typename MortonDistributionType>
-        [[nodiscard]] auto build_let_leaves(cpp_tools::parallel_manager::parallel_manager& para, BoxType const& box,
+        template<typename Box, typename VectorLeafInfo, typename MortonDistribution>
+        [[nodiscard]] auto build_let_leaves(cpp_tools::parallel_manager::parallel_manager& para, Box const& box,
                                             const std::size_t& level,
                                             const VectorLeafInfo& leaf_info /*local_morton_vect*/,
-                                            MortonDistributionType const& leaves_distrib, const int& separation)
+                                            MortonDistribution const& leaves_distrib, const int& separation)
 
           -> VectorLeafInfo
         {
             auto my_rank = para.get_process_id();
+            //             std::cout << cpp_tools::colors::green
+            //                       << "rank(" + std::to_string(my_rank) + ") --> Begin let::build_let_leaves() "
+            //                       << cpp_tools::colors::reset << std::endl;
+            //             io::print("rank(" + std::to_string(my_rank) + ") leaf_info  : ", leaf_info);
 
             //  we compute the leaves involved in the P2P operators
             auto leaf_info_to_add =
@@ -554,7 +571,9 @@ namespace scalfmm::tree
             /// needed_idx input  contains the Morton index of leaf
             ///            output   contains the number of particles in the leaf
 
+            //           io::print("rank(" + std::to_string(my_rank) + ") 1 leaf_info_to_add(p2p)  : ", leaf_info_to_add);
             parallel::utils::check_if_leaf_morton_index_exist(para, needed_idx, leaves_distrib, leaf_info);
+            //            io::print("rank(" + std::to_string(my_rank) + ") check needed_idx.size  : ", needed_idx);
             int idx{0};
 
             for(int i = 0; i < needed_idx.size(); ++i)
@@ -571,49 +590,54 @@ namespace scalfmm::tree
                 auto last = leaf_info_to_add.cbegin() + idx;
                 leaf_info_to_add.erase(last, leaf_info_to_add.end());
             }
-
+            ///
+            //             io::print("rank(" + std::to_string(my_rank) + ") final leaf_info_to_add(p2p)  : ", leaf_info_to_add);
+            //             std::cout << cpp_tools::colors::green
+            //                       << "rank(" + std::to_string(my_rank) + ")-- > End let::build_let_leaves() "
+            //                       << cpp_tools::colors::reset << std::endl;
             return leaf_info_to_add;
         }
 
-        /**
-         * @brief buildLetTree  Build the let of the tree and the leaves and cells distributions
-         *
-         * The algorithm has 5 steps:
-         *   1) We sort the particles according to their Morton Index (leaf level)
-         *   2) Build the leaf morton vector of my local particles and construct either
-         *      the leaves distribution or the cell distribution according to parameter
-         *       use_leaf_distribution or use_particle_distribution
-         *   3) Fit the particles inside the use_leaf_distribution
-         *   4) Construct the  tree according to my particles and build the leaf
-         *       morton vector of my local particles
-         *   5) Constructing the let level by level
-         *
-         * @tparam TreeType
-         * @tparam VectorType
-         * @tparam BoxType
-         * @param[in] manager   the parallel manager
-         * @param[in] number_of_particles  total number of particles in the simulation
-         * @param[in] particle_container   vector of particles on my node. On output the array is sorted and correspond to teh distribution built
-         * @param[in] box  size of the simulation box
-         * @param[in] leaf_level   level of the leaf in the tree
-         * @param[in] level_shared the level at which cells are duplicated on processors. If the level is negative,
-         * nothing is duplicated.
-         * @param[in] groupSizeLeaves blocking parameter for the leaves (particles)
-         * @param[in] groupSizeCells blocking parameter for the cells
-         * @param[in] order order of the approximation to build the tree
-         * @param[in] use_leaf_distribution to say if you consider the leaf distribution
-         * @param[in] use_particle_distribution to say if you consider the particle distribution
-         * @return localGroupTree the LET of the octree processors
-         */
-        template<typename TreeType, typename VectorType, typename BoxType>
-        auto buildLetTree(cpp_tools::parallel_manager::parallel_manager& manager,
-                          const std::size_t& number_of_particles, VectorType& particle_container, const BoxType& box,
-                          const int& leaf_level, const int& level_shared, const int groupSizeLeaves,
-                          const int groupSizeCells, const int order, const int separation,
-                          const bool use_leaf_distribution, const bool use_particle_distribution) -> TreeType
+        ///
+        /// \brief buildLetTree  Build the let of the tree and the leaves and cells distributions
+        ///
+        /// The algorithm has 5 steps:
+        ///   1) We sort the particles according to their Morton Index (leaf level)
+        ///   2) Build the leaf morton vector of my local particles and construct either
+        ///      the leaves distribution or the cell distribution according to parameter
+        ///       use_leaf_distribution or use_particle_distribution
+        ///   3) Fit the particles inside the use_leaf_distribution
+        ///   4) Construct the  tree according to my particles and build the leaf
+        ///       morton vector of my local particles
+        ///   5) Constructing the let level by level
+        ///
+        /// \param[in]    manager   the parallel manager
+        /// \param[in] number_of_particles  total number of particles in the simulation
+        /// \param[in]  particle_container   vector of particles on my node. On output the
+        ///                 array is sorted and corresponds to the distribution built
+        /// \param[in]     box  size of the simulation box
+        /// \param[in] leaf_level   level of the leaf in the tree
+        /// \param[in] level_shared the level at which cells are duplicated on processors. If the level is negative,
+        /// nothing is duplicated.
+        /// \param[in] groupSizeLeaves  blocking parameter for the leaves (particles)
+        /// \param[in] groupSizeCells    blocking parameter for the cells
+        /// \param[in] order order of the approximation to build the tree
+        /// \param[in] use_leaf_distribution to say if you consider the leaf distribution
+        /// \param[in] use_particle_distribution to say if you consider the particle distribution
+        /// \return localGroupTree the LET of the octree distributed on the processors
+        template<typename Tree_type, typename Vector_type, typename Box_type>
+        Tree_type
+        buildLetTree(cpp_tools::parallel_manager::parallel_manager& manager, const std::size_t& number_of_particles,
+                     Vector_type& particle_container, const Box_type& box, const int& leaf_level,
+                     const int& level_shared, const int groupSizeLeaves, const int groupSizeCells, const int order,
+                     const int separation, const bool use_leaf_distribution, const bool use_particle_distribution)
         {
+            // std::cout << cpp_tools::colors::green << " --> Begin let::group_let() " << cpp_tools::colors::reset
+            //           << std::endl;
             //
-            static constexpr std::size_t dimension = VectorType::value_type::dimension;
+            static constexpr std::size_t dimension = Vector_type::value_type::dimension;
             const auto rank = manager.get_process_id();
             ////////////////////////////////////////////////////////////////////////////
             ///   Sort the particles at the leaf level according to their Morton index
@@ -646,27 +670,28 @@ namespace scalfmm::tree
             {
                 particleMortonIndex[part] =
                   scalfmm::index::get_morton_index(particle_container[part].position(), box, leaf_level);
-                //                std::cout << part << " m  " << particleMortonIndex[part] << particle_container[part] << std::endl;
+                // std::cout << part << " m  " << particleMortonIndex[part] << particle_container[part] << std::endl;
             }
             auto leafMortonIdx(particleMortonIndex);
             // delete duplicate indexes
             auto last = std::unique(leafMortonIdx.begin(), leafMortonIdx.end());
             leafMortonIdx.erase(last, leafMortonIdx.end());
             ///////////////////////////////////////////////////////////////////////////////////
-            io::print("rank(" + std::to_string(rank) + ")  -->  init leafMortonIdx: ", leafMortonIdx);
+            // io::print("rank(" + std::to_string(rank) + ")  -->  init leafMortonIdx: ", leafMortonIdx);
             ///
             ////////////////////////////////////////////////////////////////////////////////////////////
             ////   Construct a uniform distribution for the leaves/cells at the leaves level
             ///
             /// A morton index should be own by only one process
             ///
-            using morton_distrib_type = typename TreeType::data_distrib_type;
+            using morton_distrib_type = typename Tree_type::data_distrib_type;
 
             ///
             ///  Build a uniform distribution of the leaves/cells
             ///  Here the distribution is a closed interval and not semi open one !!!
             ///
             morton_distrib_type leaves_distrib;
+            morton_distrib_type particles_distrib(manager.get_num_processes());
             if(use_leaf_distribution)
             {
                 leaves_distrib = std::move(scalfmm::parallel::utils::balanced_leaves(manager, leafMortonIdx));
@@ -675,6 +700,7 @@ namespace scalfmm::tree
                     interval[1] += 1;
                 }
             }
+            //            io::print("rank(" + std::to_string(rank) + ")  -->  leaves_distrib: ", leaves_distrib);
             ////                End
             ////////////////////////////////////////////////////////////////////////////////////////////
             ///
@@ -685,7 +711,6 @@ namespace scalfmm::tree
             ///
             /// A Morton index should be owned by only one process
             ///
-            morton_distrib_type particles_distrib(manager.get_num_processes());
             if(use_particle_distribution)
             {
                 particles_distrib = std::move(scalfmm::parallel::utils::balanced_particles(
@@ -721,13 +746,14 @@ namespace scalfmm::tree
             ///
             parallel::utils::fit_particles_in_distrib(manager, particle_container, particleMortonIndex,
                                                       particles_distrib, box, leaf_level, number_of_particles);
+            // io::print("rank(" + std::to_string(rank) + ")  --> particle_container: ", particle_container);
             ///    All the particles are now located on the right process
             ////////////////////////////////////////////////////////////////////////////////////////////
             ///
             ///   Construct the local tree based on our set of particles
             // Build an empty tree
-            TreeType localGroupTree(manager, static_cast<std::size_t>(leaf_level + 1), level_shared, order,
-                                    groupSizeLeaves, groupSizeCells, box);
+            Tree_type localGroupTree(manager, static_cast<std::size_t>(leaf_level + 1), level_shared, order,
+                                     groupSizeLeaves, groupSizeCells, box);
             /// Set true because the particles are already sorted
             ///  In fact we have all the leaves to add in leafMortonIdx - could be used to construct
             /// the tree !!!
@@ -735,6 +761,9 @@ namespace scalfmm::tree
 
 #ifdef SCALFMM_USE_MPI
 
+            // std::cout << cpp_tools::colors::red;
+            // io::print("rank(" + std::to_string(rank) + ") leafMortonIdx: ", leafMortonIdx);
+            // std::cout << cpp_tools::colors::reset << std::endl;
             ///  End
             ////////////////////////////////////////////////////////////////////////////////////////////
             ///
@@ -746,6 +775,7 @@ namespace scalfmm::tree
             {
                 leafMortonIdx[i] = scalfmm::index::get_morton_index(particle_container[i].position(), box, leaf_level);
             }
+            // io::print("rank(" + std::to_string(rank) + ")  -->  leafMortonIdx:    ", leafMortonIdx);
 
             // localLeafInfo contains information on the leaves (Morton index, number of particles) owned by the current process
             std::vector<tree::leaf_info_type<morton_type>> localLeafInfo(leafMortonIdx.size());
@@ -770,6 +800,8 @@ namespace scalfmm::tree
             }
             leafMortonIdx.resize(idx + 1);
             localLeafInfo.resize(leafMortonIdx.size());
+            // io::print("rank(" + std::to_string(rank) + ")  -->  localLeafInfo:    ", localLeafInfo);
+            // io::print("rank(" + std::to_string(rank) + ")  -->  leafMortonIdx:    ", leafMortonIdx);
             ////////////////////////////////////////////////////////////////////////////////////////
             // Build the pointer of the tree with all parameters
 
@@ -783,9 +815,17 @@ namespace scalfmm::tree
 
                 auto ghostP2P_leafInfo =
                   build_let_leaves(manager, box, leaf_level, localLeafInfo, particles_distrib, separation);
+                // io::print("rank(" + std::to_string(rank) + ")  -->  final ghostP2P_leafInfo:    ",
+                // ghostP2P_leafInfo); io::print("rank(" + std::to_string(rank) + ")  -->  final localLeafInfo:    ",
+                // localLeafInfo);
 
                 localGroupTree.set_leaf_distribution(particles_distrib);
 
+                // std::cout << std::flush;
+                // std::cout << cpp_tools::colors::red;
+                // std::cout << "END LEAF LEVEL " << std::endl;
+                // std::cout << cpp_tools::colors::reset;
+
                 /// If the distribution is not the same for the leaf and the cell we redistribute the
                 /// morton index according to the uniform distribution of morton index
                 ///
@@ -817,41 +857,66 @@ namespace scalfmm::tree
 
                 auto ghost_m2l_cells =
                   build_let_at_level(manager, box, leaf_level, leafMortonIdx, level_dist[leaf_level], separation);
+                // io::print("rank(" + std::to_string(rank) + ")  -->  final ghost_cells(m2l):    ", ghost_m2l_cells);
                 auto ghost_m2m_cells =
                   build_ghost_m2m_let_at_level(manager, box, leaf_level, leafMortonIdx, level_dist[leaf_level]);
+                //                 io::print("rank(" + std::to_string(rank) + ")  -->  ghost_cells(m2m):    ", ghost_m2m_cells);
 
                 // distribution, particles
+                // std::cout << "   $$$$$$$$$$$$$$$$$$$$$$$$$ leaf level " << leaf_level << " $$$$$$$$$$$$$$$$$$$$$$$$$ "
+                //           << std::endl;
 
                 localGroupTree.create_from_leaf_level(localLeafInfo, ghostP2P_leafInfo, ghost_m2l_cells,
                                                       ghost_m2m_cells, particles_distrib[rank],
                                                       level_dist[leaf_level][rank]);
+                // std::cout << "   $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ leaf level $$$$$$$$$$$$$$$$$$$$$$$$$$ "
+                //           << std::endl;
+                // parallel::utils::print_distrib("leaf_cell distribution ", rank, level_dist[leaf_level]);
 
                 // build all leaves between leaf_level - 1 and level_shared -1.
                 //  we use the maximum because if we don't share certain levels this number is <0
+                // std::cout << "std::max(level_shared, int(localGroupTree.top_level())) "
+                //           << std::max(level_shared, int(localGroupTree.top_level())) << std::endl;
+                // std::cout << "  XXXXXXXXXX -> std::max(level_shared, int(localGroupTree.top_level() - 1))"
+                //           << std::max(level_shared, int(localGroupTree.top_level() - 1)) << std::endl;
                 for(int level = leaf_level - 1; level >= localGroupTree.top_level(); --level)
                 {
+                    // std::cout << "   $$$$$$$$$$$$$$$$$$$$$$$$$ level " << level << "   $$$$$$$$$$$$$$$$$$$$$$$$$ "
+                    //           << std::endl;
                     std::int64_t ghost_l2l_cell{-1};
 
                     // Get the distribution at the current level, the ghost cell involved in l2l operator
                     //  and the morton index of the existing cells at this level
                     level_dist[level] = std::move(parallel::utils::build_upper_distribution(
                       manager, dimension, level, leafMortonIdx, ghost_l2l_cell, level_dist[level + 1]));
+                    // io::print("rank(" + std::to_string(rank) + ") MortonIdx(" + std::to_string(level) + "): ",
+                    //           leafMortonIdx);
+                    // std::cout << " ghost_l2l_cell: " << ghost_l2l_cell << std::endl;
                     // Set the distribution in the tree
                     localGroupTree.set_cell_distribution(level, level_dist[level]);
                     // build the m2l ghost cells at this level
                     auto ghost_cells_level =
                       build_let_at_level(manager, box, level, leafMortonIdx, level_dist[level], separation);
 
+                    // io::print("rank(" + std::to_string(rank) + ") level=" + std::to_string(level) +
+                    //             " -->  final ghost_cells(m2l):    ",
+                    //           ghost_cells_level);
                     // build the m2m ghost cells at this level
 
                     auto ghost_m2m_cells = build_ghost_m2m_let_at_level(
                       manager, box, leaf_level, leafMortonIdx, level_dist[level], level == localGroupTree.top_level());
-                    //                     io::print("rank(" + std::to_string(rank) + ")  -->  ghost_cells(m2m):    ", ghost_m2m_cells);
+                    // io::print("rank(" + std::to_string(rank) + ")  -->  ghost_cells(m2m):    ", ghost_m2m_cells);
+
                     // Create the group of cells structure for this level
                     localGroupTree.create_cells_at_level(level, leafMortonIdx, ghost_cells_level, ghost_m2m_cells,
                                                          ghost_l2l_cell, level_dist[level][rank]);
+                    // std::cout << "   $$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$$ " << std::endl
+                    //           << std::flush;
                 }
+                // std::cout << " end loop\n" << std::flush;
                 manager.get_communicator().barrier();
+                // std::cout << " end barrier\n" << std::flush;
             }
             else
 #endif          // SCALFMM_USE_MPI
@@ -867,6 +932,7 @@ namespace scalfmm::tree
                 localGroupTree.construct(particleMortonIndex);
                 // then, we fill each leaf with its particles (the particle container is sorted )
                 localGroupTree.fill_leaves_with_particles(particle_container);
+
                 //
                 localGroupTree.set_leaf_distribution(particles_distrib);
                 localGroupTree.set_cell_distribution(leaf_level, leaves_distrib);
@@ -879,8 +945,17 @@ namespace scalfmm::tree
                 }
             }
 
+            // std::cout << cpp_tools::colors::red << std::endl << std::flush;
+            // std::cout << "set iterators \n" << std::flush << std::flush;
             localGroupTree.set_valid_iterators(true);
+            // std::cout << "begin fill_leaves_with_particles \n" << std::flush;
             localGroupTree.fill_leaves_with_particles(particle_container);
+            // std::cout << "end fill_leaves_with_particles \n" << std::flush;
+
+            // std::cout << cpp_tools::colors::reset << std::endl;
+            // std::cout << cpp_tools::colors::green << " --> End let::group_let() " << cpp_tools::colors::reset
+            //           << std::endl
+            //           << std::flush;
 
             return localGroupTree;
         }
diff --git a/include/scalfmm/tree/group_tree_view.hpp b/include/scalfmm/tree/group_tree_view.hpp
index 500ec1c22acf194a049502cd13215dab7a064a3d..b5d0c1a5b604571632c7ea48a9431d11d8bd054e 100644
--- a/include/scalfmm/tree/group_tree_view.hpp
+++ b/include/scalfmm/tree/group_tree_view.hpp
@@ -911,7 +911,7 @@ namespace scalfmm::component
                 }
                 ++group_index;
             }
-#ifndef _DEBUG_BLOCK_DATA
+#ifdef _DEBUG_BLOCK_DATA
             std::clog << "  FINAL block\n";
             int tt{0};
             for(auto pg: m_group_of_leaves)
@@ -952,16 +952,31 @@ namespace scalfmm::component
             }
         }
 
-        /**
-         * @brief Resets all particles (positions, inputs, outputs and variables).
-         *
-         */
-        inline auto reset_particles() -> void
+        /// @brief Count the particles stored in the leaves owned by the current process
+        ///
+        /// @return the number of local particles
+        inline auto number_particles() -> std::size_t
+        {
+            std::size_t nb{0};
+            for(auto it = this->begin_mine_leaves(); it != end_mine_leaves(); ++it)
+            {
+                // number of particles
+                nb += (*it)->storage().size();
+            }
+            return nb;
+        }
+        /// @brief Reset the particles by clearing the particle container of every leaf
+        ///
+        inline auto reset_particles() -> void
         {
-            // loop on group of leaves
             for(auto pg: m_group_of_leaves)
             {
-                pg->storage().reset_particles();
+                // loop on leaves
+                for(auto& leaf: pg->block())
+                {
+                    leaf.particles().clear();
+                }
             }
         }
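+        // A minimal usage sketch (tree denotes an instance of this group tree):
+        //   std::size_t nb = tree.number_particles();   // particles in the leaves owned by this process
+        //   tree.reset_particles();                     // clear the particles stored in the leaves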
 
diff --git a/include/scalfmm/tree/io.hpp b/include/scalfmm/tree/io.hpp
index 64753b5cfbb3d35f28424e2430da3b54e706584a..2a555856614177f6b71ea4524bb50d96f76e06ac 100644
--- a/include/scalfmm/tree/io.hpp
+++ b/include/scalfmm/tree/io.hpp
@@ -106,17 +106,17 @@ namespace scalfmm::io
     template<typename TreeType>
     inline auto trace(std::ostream& os, const TreeType& tree, const std::size_t level_trace = 0) -> void
     {
-        std::cout << "Trace of the group tree\n";
+        std::cout << "Trace of the tree\n";
 
         auto level_0 = []() {};
 
         auto level_1 = [&tree, &os]()
         {
-            os << "group_tree | height = " << tree.height() << '\n';
-            os << "group_tree | order =  " << tree.order() << '\n';
-            os << "group_tree | Blocking group size for leaves = " << tree.group_of_leaf_size() << '\n';
-            os << "group_tree | Blocking group size for cells =  " << tree.group_of_cell_size() << '\n';
-            os << "group_tree | number of leaves group =         " << tree.leaf_groups_size() << '\n';
+            os << "group_tree | height = " << tree.height() << '\n' << std::flush;
+            os << "group_tree | order =  " << tree.order() << '\n' << std::flush;
+            os << "group_tree | Blocking group size for leaves = " << tree.group_of_leaf_size() << '\n' << std::flush;
+            os << "group_tree | Blocking group size for cells =  " << tree.group_of_cell_size() << '\n' << std::flush;
+            os << "group_tree | number of leaves group =         " << tree.leaf_groups_size() << '\n' << std::flush;
             auto cell_level_it = tree.cbegin_cells() + (tree.height() - 1);
 
             int top_level = tree.box().is_periodic() ? 0 : 2;
@@ -125,7 +125,8 @@ namespace scalfmm::io
                 auto group_of_cell_begin = std::cbegin(*(cell_level_it));
                 auto group_of_cell_end = std::cend(*(cell_level_it));
                 os << "group_tree | number of cells group (" << level
-                   << ")= " << std::distance(group_of_cell_begin, group_of_cell_end) << '\n';
+                   << ")= " << std::distance(group_of_cell_begin, group_of_cell_end) << '\n'
+                   << std::flush;
             }
         };
 
@@ -133,9 +134,9 @@ namespace scalfmm::io
         {
             auto tree_height = tree.height();
             std::size_t id_group{0};
-            os << "======================================================================\n";
-            os << "========== leaf level : " << tree_height - 1 << " ============================\n";
-            os << tree.group_of_leaf_size() << " groups at leaf level.\n";
+            os << "======================================================================\n" << std::flush;
+            os << "========== leaf level : " << tree_height - 1 << " ============================\n" << std::flush;
+            os << tree.group_of_leaf_size() << " groups at leaf level.\n" << std::flush;
 
             std::for_each(tree.cbegin_leaves(), tree.cend_leaves(),
                           //    std::cbegin(m_group_of_leaf), std::cend(m_group_of_leaf),
@@ -154,8 +155,8 @@ namespace scalfmm::io
                                                   { os << leaf.index() << "(" << leaf.size() << ") "; });
                               os << std::endl;
                           });
-            os << "======================================================================\n";
-            os << "======================================================================\n";
+            os << "======================================================================\n" << std::flush;
+            os << "======================================================================\n" << std::flush;
 
             //    auto cell_level_it = std::cbegin(m_group_of_cell_per_level) + (tree_height - 1);
             auto cell_level_it = tree.cbegin_cells() + (tree_height - 1);
@@ -179,7 +180,9 @@ namespace scalfmm::io
                                   os << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
                                   os << "    group size:  " << current_group_symbolics.number_of_component_in_group
                                      << ", ";
-                                  os << "global index =  " << current_group_symbolics.idx_global << " \n";
+                                  os << "global index =  " << current_group_symbolics.idx_global << " \n"
+                                     << " ref: depend(multi)=" << &ptr_group->ccomponent(0).cmultipoles(0)
+                                     << " rf depend(locals)=" << &ptr_group->ccomponent(0).clocals(0) << " \n";
                                   os << "    index: ";
                                   component::for_each(std::begin(*ptr_group), std::end(*ptr_group),
                                                       [&os](auto& cell) { os << cell.index() << " "; });
@@ -241,14 +244,14 @@ namespace scalfmm::io
             auto tree_height = tree.height();
 
             std::size_t id_group{0};
-            os << "========================== M2L interaction list ========================= \n";
+            os << "========================== M2L interaction list ========================= \n" << std::flush;
 
             auto cell_level_it = tree.cbegin_cells() + (tree_height - 1);
             id_group = 0;
             int top_level = tree.box().is_periodic() ? 0 : 2;
             for(int level = int(tree_height) - 1; level >= top_level; --level)
             {
-                os << "========== level : " << level << " ============================\n";
+                os << "========== level : " << level << " ============================\n" << std::flush;
                 auto group_of_cell_begin = std::cbegin(*(cell_level_it));
                 auto group_of_cell_end = std::cend(*(cell_level_it));
                 std::for_each(
@@ -258,27 +261,29 @@ namespace scalfmm::io
                       auto const& current_group_symbolics = ptr_group->csymbolics();
                       os << "*** Group of cell index " << ++id_group << " *** index in ["
                          << current_group_symbolics.starting_index << ", " << current_group_symbolics.ending_index
-                         << "[";
-                      os << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n";
-                      os << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", ";
+                         << "[" << std::flush;
+                      os << ", is_mine: " << std::boolalpha << current_group_symbolics.is_mine << "\n" << std::flush;
+                      os << "    group size:  " << current_group_symbolics.number_of_component_in_group << ", "
+                         << std::flush;
                       os << "global index =  " << current_group_symbolics.idx_global << " \n"
                          << " ref: depend(multi)=" << &ptr_group->ccomponent(0).cmultipoles(0)
-                         << " rf depend(locals)=" << &ptr_group->ccomponent(0).clocals(0) << " \n";
-                      os << "    index: \n";
+                         << " rf depend(locals)=" << &ptr_group->ccomponent(0).clocals(0) << " \n"
+                         << std::flush;
+                      os << "    index: \n" << std::flush;
                       int cpt = 0;
                       component::for_each(std::begin(*ptr_group), std::end(*ptr_group),
                                           [&cpt, &os](auto& cell)
                                           {
                                               auto& cell_symbolics = cell.symbolics();
                                               os << "       " << cpt++ << "  " << cell.index() << "  m2l_list ("
-                                                 << cell_symbolics.existing_neighbors << "): ";
+                                                 << cell_symbolics.existing_neighbors << "): " << std::flush;
                                               // get the m2l interaction list
                                               auto index = cell_symbolics.interaction_iterators;
                                               for(std::size_t idx = 0; idx < cell_symbolics.existing_neighbors; ++idx)
                                               {
                                                   os << index[idx]->index() << " ";
                                               }
-                                              os << std::endl;
+                                              os << std::endl << std::flush;
                                           });
                       os << std::endl;
                   });
diff --git a/include/scalfmm/tree/utils.hpp b/include/scalfmm/tree/utils.hpp
index f91229e48da054bdca4fc48618f3767c5221f1d1..02746330ad1ffce69e34a09b646e4ee6ccacacdf 100644
--- a/include/scalfmm/tree/utils.hpp
+++ b/include/scalfmm/tree/utils.hpp
@@ -5,6 +5,13 @@
 #ifndef SCALFMM_TREE_UTILS_HPP
 #define SCALFMM_TREE_UTILS_HPP
 
+#include <array>
+#include <fstream>
+#include <iostream>
+#include <iterator>
+#include <tuple>
+#include <type_traits>
+
 #include "scalfmm/container/point.hpp"
 #include "scalfmm/meta/traits.hpp"
 #include "scalfmm/meta/utils.hpp"
@@ -13,13 +20,6 @@
 #include "scalfmm/utils/massert.hpp"
 #include "scalfmm/utils/math.hpp"
 
-#include <array>
-#include <fstream>
-#include <iostream>
-#include <iterator>
-#include <tuple>
-#include <type_traits>
-
 // namespace scalfmm::utils
 namespace scalfmm::index
 {
diff --git a/include/scalfmm/utils/compare_results.hpp b/include/scalfmm/utils/compare_results.hpp
index 5e43bde84aee1051da8b6b57d9176014af6f4a1b..4302a33988d1e2d5e50f6651c697f6775b6520bb 100644
--- a/include/scalfmm/utils/compare_results.hpp
+++ b/include/scalfmm/utils/compare_results.hpp
@@ -22,16 +22,20 @@ namespace scalfmm
     namespace utils
     {
         /**
-         * @brief
+         * @brief Compare two particle arrays
+         *
+         * Each array stores the positions, the inputs and the outputs of the particles. The number of
+         * values per particle (nb_values) is array1.size() / nbParticles, and index1_to_compare and
+         * index2_to_compare must lie in [0, nb_values[.
          *
          * @tparam ArrayType
          * @param tag
          * @param dimension
          * @param nbParticles
-         * @param index1_to_compare
+         * @param index1_to_compare index of the value to compare in array1
          * @param index2_to_compare
-         * @param array1
-         * @param array2
+         * @param array1 first array
+         * @param array2 second array
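+         *
+         * A minimal usage sketch (the index vectors and the arrays are placeholders):
+         * @code
+         * scalfmm::utils::compare_two_arrays("TAG", dimension, nb_particles, index1, index2, array1, array2);
+         * @endcode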
          */
         template<class ArrayType>
         auto compare_two_arrays(const std::string& tag, const int dimension, const std::size_t& nbParticles,
diff --git a/include/scalfmm/utils/compare_trees.hpp b/include/scalfmm/utils/compare_trees.hpp
index 3b4318ac705157bdafc47efc9e2b0172165f8b89..44982ca2045de42cefcc2f7cceec6a67e70291cc 100644
--- a/include/scalfmm/utils/compare_trees.hpp
+++ b/include/scalfmm/utils/compare_trees.hpp
@@ -1,39 +1,32 @@
-// --------------------------------
-// See LICENCE file at project root
-// File : scalfmm/utils/compare_trees.hpp
-// --------------------------------
 #pragma once
+#include <iostream>
 
-#include "scalfmm/utils/io_helpers.hpp"
-
-#include "xtensor-blas/xblas.hpp"
+#include <xtensor-blas/xblas.hpp>
 
-#include <iostream>
+#include <scalfmm/utils/io_helpers.hpp>
 
 namespace scalfmm::utils
 {
+    /////////////////////////////////
+    ///
     /**
      * @brief compare the cells of two trees
      *
      *  For all levels, depending on the option, we compare the multipole and local tensors. We calculate the
-     * frobenius norm of the error between the two tensors. If this norm is smaller than eps, then the test is true.
+     * Frobenius norm of the error between the two tensors. If this norm is smaller than eps, then the test is true.
      *
      *  option 1 only the multipoles
      *  option 2 only the locals
      *  option 3 both multipoles and locals
      *
-     * @tparam TreeType1
-     * @tparam TreeType2
-     * @tparam ValueType
      * @param tree1 first tree
      * @param tree2 second tree
      * @param eps the threshold
      * @param option int (1,2,3) -the option describe above
      * @return the comparaison
      */
-    template<typename TreeType1, typename TreeType2, typename ValueType>
-    inline auto compare_two_trees(TreeType1 const& tree1, TreeType2 const& tree2, ValueType const eps,
-                                  int option) -> bool
+    template<typename Tree1, typename Tree2, typename Value_type>
+    inline auto compare_two_trees(Tree1 const& tree1, Tree2 const& tree2, Value_type const eps, int option) -> bool
     {
         bool check{true}, check_mul{true}, check_loc{true};
 
@@ -117,7 +110,9 @@ namespace scalfmm::utils
                     // locals
                     if(option != 1)
                     {
-                        std::cout << "check locals" << std::endl;
+                        std::cout << "check locals "
+                                  << " level " << level << " Cell morton " << cell1.csymbolics().morton_index;
+                        // << " error " << error << std::endl;
                         auto const& locals1 = cell1.clocals();
                         auto const& locals2 = cell2.clocals();
                         auto number_of_arrays = locals1.size();
@@ -130,21 +125,21 @@ namespace scalfmm::utils
                             diff.reshape({diff.size()});
                             auto error = xt::linalg::norm(diff);
                             check = (error < eps);
-
                             // std::cout << "diff\n" << diff << std::endl;
+                            std::cerr << " is good? " << std::boolalpha << check << std::endl;
                             if(not check)
                             {
-                                std::cerr << "level " << level << " Cell morton " << cell1.csymbolics().morton_index
-                                          << " error " << error << std::endl;
-                                std::cerr << "local1(" << l << ")\n" << local1 << std::endl;
-                                std::cerr << "local2(" << l << ")\n" << local2 << std::endl;
+                                std::cerr << "   error " << error << std::endl;
+                                std::cerr << "   local1(" << l << ")\n" << local1 << std::endl;
+                                std::cerr << "   local2(" << l << ")\n" << local2 << std::endl;
+                                std::cerr << "   diff(" << l << ")\n" << diff << std::endl;
                                 check_loc = false;
                                 check_level = false;
                             }
                         }
                     }   // end option
-                }   // end cells
-            }   // end groups
+                }       // end cells
+            }           // end groups
             if(check_level)
             {
                 std::cout << "level: " << level << " is good !\n";
diff --git a/include/scalfmm/utils/math.hpp b/include/scalfmm/utils/math.hpp
index 0199da1f6150349d2f0cf74b857b26a4a0132ae2..6d5f6b089299c74836637710463890d19e3ba64d 100644
--- a/include/scalfmm/utils/math.hpp
+++ b/include/scalfmm/utils/math.hpp
@@ -13,11 +13,13 @@
 namespace scalfmm::math
 {
     /**
-     * @brief
+     * @brief Compute the factorial of value
      *
      * @tparam T
+     * 
      * @param value
-     * @return T
+     * 
+     * @return value! as a value of type T
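+     *
+     * A minimal usage sketch:
+     * @code
+     * auto f = scalfmm::math::factorial<double>(5);   // 120.0
+     * @endcode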
      */
     template<typename T>
     inline auto factorial(int value) -> T
@@ -34,15 +36,20 @@ namespace scalfmm::math
         }
         return T(result);
     }
-
     /**
-     * @brief
+     * @brief Compute a^p for a non-negative integer exponent p (constexpr version)
      *
      * @tparam T
      * @param a
      * @param p
      * @return T
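+     *
+     * A minimal usage sketch:
+     * @code
+     * constexpr auto v = scalfmm::math::pow(2.0, std::size_t(10));   // 1024.0, evaluated at compile time
+     * @endcode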
      */
+    template<typename T>
+    inline constexpr auto pow(const T a, const std::size_t p) -> T
+    {
+        return p == 0 ? 1 : a * pow<T>(a, p - 1);
+    }
+
     template<typename T>
     inline auto pow(T a, int p) -> T
     {
@@ -55,34 +62,14 @@ namespace scalfmm::math
     }
 
     /**
-     * @brief
-     *
-     * @tparam T
-     * @param a
-     * @param p
-     * @return T
-     */
-    template<typename T>
-    inline constexpr auto pow(T a, std::size_t p) -> T
-    {
-        return p == 0 ? 1 : a * pow<T>(a, p - 1);
-    }
-
-    /**
-     * @brief
+     * @brief Check whether |a - b| < epsilon for two floating-point values
      *
      * @tparam T
      * @tparam U
-     * @tparam typename
-     * @tparam T>
-     * @tparam T>,
-     * typename
-     * @tparam U>
      * @param a
      * @param b
-     * @param epsilon
-     * @return true
-     * @return false
+     * @param epsilon the threshold
+     * @return true if |a - b| < epsilon, false otherwise
      */
     template<typename T, typename U, typename = std::enable_if_t<std::is_floating_point<T>::value, T>,
              typename = std::enable_if_t<std::is_floating_point<U>::value, U>>
@@ -92,15 +79,15 @@ namespace scalfmm::math
     }
 
     /**
-     * @brief
+     * @brief Check whether value lies in the half-open interval [range_begin, range_end[
      *
-     * @tparam ValueType1
-     * @tparam ValueType
+     * @tparam ValueType1 for value
+     * @tparam ValueType for the interval
      * @param value
      * @param range_begin
      * @param range_end
-     * @return true
-     * @return false
+     * @return true if value is in [range_begin, range_end[, false otherwise
+     *
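+     * A minimal usage sketch:
+     * @code
+     * bool inside = scalfmm::math::between(3, 0, 5);    // true
+     * bool at_end = scalfmm::math::between(5, 0, 5);    // false, the upper bound is excluded
+     * @endcode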
      */
     template<typename ValueType1, typename ValueType>
     inline constexpr auto between(ValueType1 value, ValueType range_begin, ValueType range_end) -> bool
diff --git a/include/scalfmm/utils/parameters.hpp b/include/scalfmm/utils/parameters.hpp
index 68c48dda92ae62ea4baf1675649b3a25173f329c..c5077e491db0b9453d0bbf5dd0cfc1adaf7a6848 100644
--- a/include/scalfmm/utils/parameters.hpp
+++ b/include/scalfmm/utils/parameters.hpp
@@ -65,6 +65,7 @@ namespace args
         cpp_tools::cl_parser::str_vec flags = {"--output-file", "-fout"};
         std::string description = "Output particle file (with extension .fma (ascii) or .bfma (binary)).";
         using type = std::string;
+        type def = "output.fma";
     };
 
     /**
diff --git a/include/scalfmm/utils/sort.hpp b/include/scalfmm/utils/sort.hpp
index d563a0bfecb05aed6a240f07f13f57c8e5a2a5dc..0d9fba59206fb755939a136d8e8e9ef6643898f6 100644
--- a/include/scalfmm/utils/sort.hpp
+++ b/include/scalfmm/utils/sort.hpp
@@ -264,7 +264,7 @@ namespace scalfmm::utils
         std::copy(array.begin(), array.end(), tmp_array);
         constexpr static const std::size_t dimension = points_type::dimension;
         //
-        const std::size_t max_level = 2;   //sizeof(morton_type) * 8 / dimension - 1;
+        const std::size_t max_level = sizeof(morton_type) * 8 / dimension - 1;
         using pair_type = std::pair<morton_type, int>;
         std::vector<pair_type> tosort(nbParticles);
 #pragma omp parallel for shared(tosort, nbParticles, box, max_level, array)
@@ -275,7 +275,7 @@ namespace scalfmm::utils
             tosort[i].second = i;
         }
 
-        std::sort(tosort.begin(), tosort.end(), [&](pair_type& a, pair_type& b) { return (a.first > b.first); });
+        std::sort(tosort.begin(), tosort.end(), [&](pair_type& a, pair_type& b) { return (a.first < b.first); });
 
         //
         // We fill the sorted array
diff --git a/modules/internal/cpp_tools b/modules/internal/cpp_tools
index 8358a544112a2ec6ee72d0a72a37de672b01a310..c67bb86b393d31dff8758fd20a6d87f9f6b8120d 160000
--- a/modules/internal/cpp_tools
+++ b/modules/internal/cpp_tools
@@ -1 +1 @@
-Subproject commit 8358a544112a2ec6ee72d0a72a37de672b01a310
+Subproject commit c67bb86b393d31dff8758fd20a6d87f9f6b8120d
diff --git a/tools/compare_files.cpp b/tools/compare_files.cpp
index 34f2caf1480247bd172b8dd6874bff73cf11b43b..dd46fea031add09a7270ca58d533c965af545e2a 100644
--- a/tools/compare_files.cpp
+++ b/tools/compare_files.cpp
@@ -5,14 +5,18 @@
  *      Author: Olivier Coulaud
  */
 
-#include <cstdlib>
 #include <fstream>
 #include <iostream>
 #include <string>
+#include <vector>
 //
+#include "scalfmm/meta/traits.hpp"
 #include "scalfmm/tools/fma_loader.hpp"
 #include "scalfmm/tree/box.hpp"
 #include "scalfmm/utils/compare_results.hpp"
+#include "scalfmm/utils/sort.hpp"
+
+#include "scalfmm/utils/parameters.hpp"
 
 #include <cpp_tools/cl_parser/cl_parser.hpp>
 #include <cpp_tools/colors/colorized.hpp>
@@ -64,9 +68,7 @@
 //!  \endcode
 
 using value_type = double;
-constexpr int dimension = 3;
-using position_type = scalfmm::container::point<double, dimension>;
-using box_type = scalfmm::component::box<position_type>;
+
 ///
 /////////////////////////////////////////////////////////////
 ///          Local parameters
@@ -165,40 +167,17 @@ namespace local_args
         };
     };
 }   // namespace local_args
-/////////////////////////////////////////////////////////////
-
-auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
+/////////////////////////////////////////////////////////////
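+/// @brief Load the two particle files, optionally sort the particles along their Morton index,
+///        and compare the selected values of the two sets for the given compile-time dimension.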
+template<int Dimension>   //, typename VectorType>
+auto cmp(std::string const& file1, std::string const& file2, bool sort_particle, std::vector<int> const& index1,
+         std::vector<int> const& index2) -> int
 {
-    /// Parsing options
-    ///
-    auto parser = cpp_tools::cl_parser::make_parser(cpp_tools::cl_parser::help{}, local_args::input_file_one(),
-                                                    local_args::input_file_two(), local_args::sort_particle(),
-                                                    local_args::index_to_compare(), local_args::index2_to_compare());
-
-    // Parameter handling
-    parser.parse(argc, argv);
-
-    std::cout << cpp_tools::colors::blue << "Entering sort_particles...\n" << cpp_tools::colors::reset;
-
-    const auto filename1{parser.get<local_args::input_file_one>()};
-    if(!filename1.empty())
-    {
-        std::cout << cpp_tools::colors::blue << "<params> Input file 1: " << filename1 << cpp_tools::colors::reset
-                  << '\n';
-    }
-
-    const auto filename2{parser.get<local_args::input_file_two>()};
-    if(!filename2.empty())
-    {
-        std::cout << cpp_tools::colors::blue << "<params> Input file 2: " << filename2 << cpp_tools::colors::reset
-                  << '\n';
-    }
-    std::vector<int> index;
-    index = parser.get<local_args::index_to_compare>();
+    using position_type = scalfmm::container::point<value_type, Dimension>;
+    using box_type = scalfmm::component::box<position_type>;
     bool verbose = true;
 
-    scalfmm::io::FFmaGenericLoader<value_type, dimension> loader1(filename1, verbose);
-    scalfmm::io::FFmaGenericLoader<value_type, dimension> loader2(filename2, verbose);
+    scalfmm::io::FFmaGenericLoader<value_type, Dimension> loader1(file1, verbose);
+    scalfmm::io::FFmaGenericLoader<value_type, Dimension> loader2(file2, verbose);
 
     //
     // Allocation
@@ -224,37 +203,90 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
     loader1.fillParticles(particles1, nb_particles);
     loader2.fillParticles(particles2, nb_particles);
 
-    if(parser.exists<local_args::sort_particle>())
+    if(sort_particle)
     {
         // define a box, used in the sort
-
+        using morton_type = std::size_t;
+        const std::size_t max_level = (sizeof(morton_type) * 8 / Dimension) - 1;
+        //
         box_type box(loader1.getBoxWidth(), loader1.getCenterOfBox());
 
         std::cout << "Sort needed !! " << std::endl;
         scalfmm::utils::sort_raw_array_with_morton_index(box, nb_particles, particles1);
+
         scalfmm::utils::sort_raw_array_with_morton_index(box, nb_particles, particles2);
     }
+    // scalfmm::meta::td<decltype(index1)> u;
+    scalfmm::utils::compare_two_arrays("TAG ", Dimension, nb_particles, index1, index2, particles1, particles2);
+    return 0;
+}
+//
+//////////////////////////////////////////////////////////////////////////////
+//
+auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
+{
+    /// Parsing options
+    ///
+    auto parser = cpp_tools::cl_parser::make_parser(
+      cpp_tools::cl_parser::help{}, args::Dimension{}, local_args::input_file_one(), local_args::input_file_two(),
+      local_args::sort_particle(), local_args::index_to_compare(), local_args::index2_to_compare());
+
+    // Parameter handling
+    parser.parse(argc, argv);
+
+    std::cout << cpp_tools::colors::blue << "Entering sort_particles...\n" << cpp_tools::colors::reset;
+    const int dimension{parser.get<args::Dimension>()};
+    const auto filename1{parser.get<local_args::input_file_one>()};
+    if(!filename1.empty())
+    {
+        std::cout << cpp_tools::colors::blue << "<params> Input file 1: " << filename1 << cpp_tools::colors::reset
+                  << '\n';
+    }
 
-    if(parser.exists<local_args::index2_to_compare>())
+    const auto filename2{parser.get<local_args::input_file_two>()};
+    if(!filename2.empty())
+    {
+        std::cout << cpp_tools::colors::blue << "<params> Input file 2: " << filename2 << cpp_tools::colors::reset
+                  << '\n';
+    }
+    std::vector<int> index;
+    index = parser.get<local_args::index_to_compare>();
+
+    // scalfmm::meta::td<decltype(index)> u1;
+
+    bool sort_particle{parser.exists<local_args::sort_particle>()};
+    bool index2_exist{parser.exists<local_args::index2_to_compare>()};
+    std::vector<int> index2;
+
+    if(index2_exist)
     {
-        std::vector<int> index2;
         index2 = parser.get<local_args::index2_to_compare>();
         if(index2.size() != index.size())
         {
             std::cerr << "Wrong number of index between file1 and file2" << std::endl;
             std::exit(EXIT_FAILURE);
         }
-        scalfmm::utils::compare_two_arrays("TAG ", dimension, nb_particles, index, index2, particles1, particles2);
     }
     else
     {
-        scalfmm::utils::compare_two_arrays("TAG ", dimension, nb_particles, index, index, particles1, particles2);
-
-        //
+        index2 = index;
     }
-    //  auto error = compareTwoArrays<FReal, FmaRWParticle<FReal,8,8>* >("TAG", nbParticles, particles1, particles2);
 
-    //
-    //   return int(error);
-    return 0;
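+    // dispatch the runtime dimension to the compile-time template parameter of cmp<>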
+    if(dimension == 1)
+    {
+        return cmp<1>(filename1, filename2, sort_particle, index, index2);
+    }
+    else if(dimension == 2)
+    {
+        return cmp<2>(filename1, filename2, sort_particle, index, index2);
+    }
+    else if(dimension == 3)
+    {
+        return cmp<3>(filename1, filename2, sort_particle, index, index2);
+    }
+    else
+    {
+        throw std::invalid_argument("The dimension must be 1, 2 or 3");
+        return 1;
+    }
 }
diff --git a/units/fmm/count_kernel_mpi.cpp b/units/fmm/count_kernel_mpi.cpp
index b3f23674fda14dca9677ae7dd3e093a071b61f76..2b561ffcb49ef9000018ca8039e1fbacc5b42be8 100644
--- a/units/fmm/count_kernel_mpi.cpp
+++ b/units/fmm/count_kernel_mpi.cpp
@@ -2,4 +2,46 @@
 //
 // Units for test fmm
 // ----------------------
+
+#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+
+static cpp_tools::parallel_manager::parallel_manager para;
+
 #include "units_count_kernel_mpi_gen.hpp"
+
+// auto run_count_kernel_mpi<Dimension, value_type>(const int& tree_height, const int& group_size, std::string const&& input_file, bool use_leaf_distribution, bool mutual = false) -> int
+
+TEMPLATE_TEST_CASE("test count 2d", "[test-count-2d]", double)
+{
+    // leaf distribution and non-mutual
+    SECTION("count 2d", "[count2d]")
+    {
+        run_count_kernel_mpi<2, double>(4, 10, path + "test_2d_ref.fma", true, false);
+    }   // h = 4
+}
+
+TEMPLATE_TEST_CASE("test count 3d", "[test-count-3d]", double)
+{
+    // leaf distribution and mutual
+    SECTION("count 3d", "[count3d]")
+    {
+        run_count_kernel_mpi<3, double>(5, 40, path + "sphere-706_source.fma", true, true);
+    }   // h = 5
+}
+
+TEMPLATE_TEST_CASE("test count 3d", "[test-count-3d]", float)
+{
+    // leaf distribution and mutual
+    SECTION("count 3d", "[count3d]")
+    {
+        run_count_kernel_mpi<3, float>(5, 40, path + "sphere-706_source.fma", true, true);
+    }   // h = 5
+}
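+
+// The parallel manager must be initialised before Catch runs the MPI test cases and finalised
+// afterwards, hence this custom main (CATCH_CONFIG_RUNNER is defined in the generated header).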
+int main(int argc, char* argv[])
+{
+    para.init();
+    int result = Catch::Session().run(argc, argv);
+    para.end();
+
+    return result;
+}
diff --git a/units/fmm/units_count_kernel_mpi_gen.hpp.in b/units/fmm/units_count_kernel_mpi_gen.hpp.in
index 5928e79710206f7a783c3d211691ff03845b63aa..53db7b703db3c2f753c0645b105b713e7ef28b42 100644
--- a/units/fmm/units_count_kernel_mpi_gen.hpp.in
+++ b/units/fmm/units_count_kernel_mpi_gen.hpp.in
@@ -20,7 +20,7 @@
 #include "scalfmm/utils/generate.hpp"
 #include "scalfmm/utils/math.hpp"
 #include <cpp_tools/colors/colorized.hpp>
-#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+// #include <cpp_tools/parallel_manager/parallel_manager.hpp>
 
 #define CATCH_CONFIG_RUNNER
 #include <catch2/catch.hpp>
@@ -49,30 +49,28 @@ inline auto constexpr get_accumulate_shape()
 }
 
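+/**
+ * @brief Build the distributed LET from the given FMA file, run the count kernel through the MPI
+ *        task-based algorithm and check that every local leaf has counted the expected total
+ *        number of particles.
+ */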
 template<int Dimension, typename value_type>
-//auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_height, const int& group_size,
-//         Array const& pbc, const int nb_level_above_root, const bool readFile, std::string& input_file,
-//         const bool interaction, bool use_leaf_distribution, bool use_particle_distribution) -> int
-auto run(const int& tree_height, const int& group_size,  std::string const && input_file,  bool use_leaf_distribution, bool mutual = false) -> int{
-
-  static constexpr std::size_t number_of_physical_values = 1;
-  //   static constexpr std::size_t dimpow2 = scalfmm::math::pow(2, Dimension);
-  const auto runtime_order = 1;
-  bool use_particle_distribution{false};
-  int level_shared{2};
-  if(!use_leaf_distribution)
-  {
-      use_particle_distribution = true ;
-  }
+auto run_count_kernel_mpi(const int& tree_height, const int& group_size, std::string const&& input_file,
+                          bool use_leaf_distribution, bool mutual = false) -> int
+{
+    static constexpr std::size_t number_of_physical_values = 1;
+    const auto runtime_order = 1;
+    bool use_particle_distribution{false};
+    int level_shared{2};
+    if(!use_leaf_distribution)
+    {
+        use_particle_distribution = true;
+    }
     //
-    cpp_tools::parallel_manager::parallel_manager para;
-    para.init();
-    
+    // cpp_tools::parallel_manager::parallel_manager para;
+    // para.init();
+
     const int rank = para.get_process_id();
     const int nproc = para.get_num_processes();
     // Parameter handling
 
     // ------------------------------------------------------------------------------
-    using Particle_type = scalfmm::container::particle<value_type, Dimension, value_type, number_of_physical_values, value_type, 1>;
+    using Particle_type =
+      scalfmm::container::particle<value_type, Dimension, value_type, number_of_physical_values, value_type, 1>;
     using container_type = scalfmm::container::particle_container<Particle_type>;
     using position_type = typename Particle_type::position_type;
     using cell_type =
@@ -87,16 +85,16 @@ auto run(const int& tree_height, const int& group_size,  std::string const && in
     point_type box_center(0.0);
     value_type box_width{1.};
     //
-      std::vector<bool> pbc(Dimension, false);
+    std::vector<bool> pbc(Dimension, false);
     int nb_level_above_root{-1};
-    
+
     container_type* container;
     std::vector<Particle_type> particles_set;
 
     std::size_t number_of_particles{};
     std::size_t local_number_of_particles{};
     {
-      bool verbose = true; //false;
+        bool verbose = true;   //false;
 
         scalfmm::io::DistFmaGenericLoader<value_type, Dimension> loader(input_file, para, verbose);
 
@@ -127,51 +125,7 @@ auto run(const int& tree_height, const int& group_size,  std::string const && in
             particles_set[idx] = p;
         }
     }
-    /*
-    else
-    {
-        // generate particles: one par leaf, the octree is full.
-        number_of_particles = std::pow(dimpow2, (tree_height - 1));
-        std::cout << "number_of_particles = " << number_of_particles << " box_width " << box_width << '\n';
-
-        auto number_of_values_per_dimension = std::size_t(scalfmm::math::pow(2, (tree_height - 1)));
-        const std::size_t bloc = number_of_particles / nproc;
-
-        local_number_of_particles = (rank < nproc - 1) ? bloc : number_of_particles - (nproc - 1) * bloc;
-        particles_set.resize(local_number_of_particles);
 
-        //
-        const std::size_t start_index{rank * bloc};
-        const std::size_t end_index{start_index + local_number_of_particles};
-        std::cout << "start_index = " << start_index << " end_index = " << end_index << '\n';
-
-        value_type step{box_width / std::pow(2, (tree_height))};
-
-        std::cout << "Number of value per dimension = " << number_of_values_per_dimension << '\n';
-        std::cout << "Step = " << step << '\n';
-
-        for(std::size_t index{start_index}, idx{0}; index < end_index; ++index, ++idx)
-        {
-            auto coord = scalfmm::index::get_coordinate_from_morton_index<Dimension>(index);
-
-            point_type pos{coord};
-            std::cout << idx << "index " << index << " coord " << coord << " centre: " << step * pos << std::endl;
-            particle_type p;
-            std::size_t ii{0};
-            for(auto& e: p.position())
-            {
-                e = -box_width * 0.5 + step * 0.5 + step * pos[ii++];
-            }
-            particles_set[idx] = p;
-        }
-    }
-    */
-    // std::cout << "pbc:    " << std::boolalpha;
-    // for(auto e: pbc)
-    // {
-    //     std::cout << e << " ";
-    // }
-    // std::cout << std::endl;
     box_type box(box_width, box_center);
 #ifdef scalfmm_BUILD_PBC
     box.set_periodicity(pbc);
@@ -226,13 +180,9 @@ auto run(const int& tree_height, const int& group_size,  std::string const && in
 
         scalfmm::io::trace(std::cout, letTree, 4);
     }
-
-    //auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
-    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::nearfield;
+    //
     auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::all;
-    //	 auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2l);
-    //	 auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2m  | scalfmm::algorithms::operators_to_proceed::m2l)  ;
-
+    //
     scalfmm::algorithms::mpi::proc_task(letTree, fmm_operator, operator_to_proceed);
     //
     std::size_t nb_particles_min = 20 * number_of_particles, nb_particles_max = 0, nb_per = 1;
@@ -273,45 +223,19 @@ auto run(const int& tree_height, const int& group_size,  std::string const && in
         std::cout << "wrong number of particles - nb particles (min) " << nb_particles_min << "  (max) "
                   << nb_particles_max << " (expected) " << number_of_particles << std::endl;
 
-        if(para.io_master())
-            std::cout << "Save Tree in parallel\n";
-        // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
-        std::string outName("saveTreeLet.bin");
-        std::string header("CHEBYSHEV LOW RANK ");
-        scalfmm::tools::io::save(para, outName, letTree, header);
+        // if(para.io_master())
+        //     std::cout << "Save Tree in parallel\n";
+        // // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
+        // std::string outName("saveTreeLet.bin");
+        // std::string header("CHEBYSHEV LOW RANK ");
+        // scalfmm::tools::io::save(para, outName, letTree, header);
     }
     std::cout << cpp_tools::colors::reset << '\n';
     REQUIRE(right_number_of_particles);
     //
-    para.end();
+    // para.end();
     //
     return right_number_of_particles;
 }
 
 //
-
-
-//auto run(const int& tree_height, const int& group_size,  std::string& input_file,  bool use_leaf_distribution, bool mutual = false) -> int{
-
-
-TEMPLATE_TEST_CASE("test count 2d", "[test-count-2d]", double)
-{
-  // leaf distrubution and not mutual
-  SECTION("count 2d", "[count2d]") { run<2,double>(4, 10, path + "test_2d_ref.fma", true, false); }   // h = 5
-}
-
-/*
-TEMPLATE_TEST_CASE("test count 3d", "[test-count-3d]", double)
-{
-  // leaf distrubution and mutual
-  SECTION("count 3d", "[count3d]") { run<3,double>(5, 40,  path +"sphere-706_source.fma", true,true); }   // h = 5
-}
-*/
-int main(int argc, char* argv[])
-{
-    int result = Catch::Session().run(argc, argv);
-
-    return result;
-}
-
-
diff --git a/units/fmm/units_count_particles_mpi_gen.hpp b/units/fmm/units_count_particles_mpi_gen.hpp
new file mode 100644
index 0000000000000000000000000000000000000000..9b8aecb2679e9ed36bb3beb9b7a1e5de54f34700
--- /dev/null
+++ b/units/fmm/units_count_particles_mpi_gen.hpp
@@ -0,0 +1,316 @@
+// @FUSE_MPI
+#include "scalfmm/algorithms/fmm.hpp"
+#include "scalfmm/algorithms/mpi/proc_task.hpp"
+//
+#include "scalfmm/container/particle.hpp"
+#include "scalfmm/container/particle_container.hpp"
+#include "scalfmm/operators/count_kernel/count_kernel.hpp"
+//
+#include "scalfmm/interpolation/grid_storage.hpp"
+#include "scalfmm/meta/utils.hpp"
+#include "scalfmm/tools/fma_dist_loader.hpp"
+#include "scalfmm/tools/tree_io.hpp"
+#include "scalfmm/tree/box.hpp"
+#include "scalfmm/tree/cell.hpp"
+#include "scalfmm/tree/dist_group_tree.hpp"
+#include "scalfmm/tree/group_let.hpp"
+#include "scalfmm/tree/leaf_view.hpp"
+#include "scalfmm/tree/utils.hpp"
+#include "scalfmm/utils/generate.hpp"
+
+#include <cpp_tools/colors/colorized.hpp>
+#include <cpp_tools/parallel_manager/parallel_manager.hpp>
+
+#define CATCH_CONFIG_RUNNER
+#include <catch2/catch.hpp>
+
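+// returns a std::array<std::size_t, dim> filled with ones (supported dimensions: 1 to 4)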
+template<std::size_t dim>
+inline auto constexpr get_accumulate_shape()
+{
+    if constexpr(dim == 1)
+    {
+        return std::array<std::size_t, dim>{1};
+    }
+    if constexpr(dim == 2)
+    {
+        return std::array<std::size_t, dim>{1, 1};
+    }
+    if constexpr(dim == 3)
+    {
+        return std::array<std::size_t, dim>{1, 1, 1};
+    }
+    if constexpr(dim == 4)
+    {
+        return std::array<std::size_t, dim>{1, 1, 1, 1};
+    }
+}
+
+template<int Dimension, typename Array>
+//auto run(cpp_tools::parallel_manager::parallel_manager& para, const int& tree_height, const int& group_size,
+//         Array const& pbc, const int nb_level_above_root, const bool readFile, std::string& input_file,
+//         const bool interaction, bool use_leaf_distribution, bool use_particle_distribution) -> int
+auto run(const int& tree_height, const int& group_size, std::string& input_file, bool use_leaf_distribution,
+         bool mutual = false) -> int
+{
+    static constexpr std::size_t number_of_physical_values = 1;
+    static constexpr std::size_t dimpow2 = std::size_t(1) << Dimension;   // 2^Dimension
+    const auto runtime_order = 1;
+
+    // the values below were parameters of the previous signature (kept above as a comment); they are
+    // given local defaults here, and 'para' is expected to be provided by the including test file.
+    const bool readFile{!input_file.empty()};
+    std::vector<bool> pbc(Dimension, false);
+    const int nb_level_above_root{-1};
+    int level_shared{2};
+    bool use_particle_distribution{!use_leaf_distribution};
+    //
+    const int rank = para.get_process_id();
+    const int nproc = para.get_num_processes();
+    // Parameter handling
+    if(readFile)
+    {
+        if(!input_file.empty())
+        {
+            std::cout << cpp_tools::colors::blue << "<params> Input file : " << input_file << cpp_tools::colors::reset
+                      << '\n';
+        }
+    }
+
+    // ------------------------------------------------------------------------------
+    using particle_type = scalfmm::container::particle<double, Dimension, double, number_of_physical_values, double, 1>;
+    using container_type = scalfmm::container::particle_container<particle_type>;
+    using position_type = typename particle_type::position_type;
+    using cell_type =
+      scalfmm::component::cell<scalfmm::component::grid_storage<double, Dimension, number_of_physical_values, 1>>;
+    using leaf_type = scalfmm::component::leaf_view<particle_type>;
+    using box_type = scalfmm::component::box<position_type>;
+    using group_tree_type = scalfmm::component::dist_group_tree<cell_type, leaf_type, box_type>;
+    //
+    // ------------------------------------------------------------------------------
+    //
+    using point_type = scalfmm::container::point<double, Dimension>;
+    point_type box_center(0.0);
+    double box_width{1.};
+    //
+    container_type* container;
+    std::vector<particle_type> particles_set;
+
+    std::size_t number_of_particles{};
+    std::size_t local_number_of_particles{};
+    if(readFile)   // Read particles from a file
+    {
+        bool verbose = false;
+
+        scalfmm::io::DistFmaGenericLoader<double, Dimension> loader(input_file, para, verbose);
+
+        number_of_particles = loader.getNumberOfParticles();
+        local_number_of_particles = loader.getMyNumberOfParticles();
+        number_of_particles = loader.getNumberOfParticles();
+        box_width = loader.getBoxWidth();
+        box_center = loader.getBoxCenter();
+
+        auto nb_val_to_red_per_part = loader.get_dimension() + loader.get_number_of_input_per_record();
+        double* values_to_read = new double[nb_val_to_red_per_part]{};
+        container = new container_type(local_number_of_particles);
+        particles_set.resize(local_number_of_particles);
+        for(std::size_t idx = 0; idx < local_number_of_particles; ++idx)
+        {
+            loader.fillParticle(values_to_read, nb_val_to_red_per_part);
+            particle_type p;
+            std::size_t ii{0};
+            for(auto& e: p.position())
+            {
+                e = values_to_read[ii++];
+            }
+            for(auto& e: p.inputs())
+            {
+                e = values_to_read[ii++];
+            }
+            // container->insert_particle(idx, p);
+            particles_set[idx] = p;
+        }
+    }
+    else
+    {
+        // generate particles: one par leaf, the octree is full.
+        number_of_particles = std::pow(dimpow2, (tree_height - 1));
+        std::cout << "number_of_particles = " << number_of_particles << " box_width " << box_width << '\n';
+
+        auto number_of_values_per_dimension = std::size_t(scalfmm::math::pow(2, (tree_height - 1)));
+        const std::size_t bloc = number_of_particles / nproc;
+
+        local_number_of_particles = (rank < nproc - 1) ? bloc : number_of_particles - (nproc - 1) * bloc;
+        particles_set.resize(local_number_of_particles);
+
+        //
+        const std::size_t start_index{rank * bloc};
+        const std::size_t end_index{start_index + local_number_of_particles};
+        std::cout << "start_index = " << start_index << " end_index = " << end_index << '\n';
+
+        double step{box_width / std::pow(2, (tree_height))};
+
+        std::cout << "Number of value per dimension = " << number_of_values_per_dimension << '\n';
+        std::cout << "Step = " << step << '\n';
+
+        for(std::size_t index{start_index}, idx{0}; index < end_index; ++index, ++idx)
+        {
+            auto coord = scalfmm::index::get_coordinate_from_morton_index<Dimension>(index);
+
+            point_type pos{coord};
+            std::cout << idx << "index " << index << " coord " << coord << " centre: " << step * pos << std::endl;
+            particle_type p;
+            std::size_t ii{0};
+            for(auto& e: p.position())
+            {
+                e = -box_width * 0.5 + step * 0.5 + step * pos[ii++];
+            }
+            particles_set[idx] = p;
+        }
+    }
+    // std::cout << "pbc:    " << std::boolalpha;
+    // for(auto e: pbc)
+    // {
+    //     std::cout << e << " ";
+    // }
+    // std::cout << std::endl;
+    box_type box(box_width, box_center);
+#ifdef scalfmm_BUILD_PBC
+    box.set_periodicity(pbc);
+#endif
+    std::cout << "Box: " << box << std::endl;
+    ///////////////////////////////////////////////////////////////////////////////////////////////////////
+    ///    Set particles in the tree and construct the let
+    ///  1) sort the particles according to their Morton index
+    ///  2) construct the tree, then the let
+    ///
+    const int leaf_level = tree_height - 1;
+    // separation criterion used to construct the M2L and P2P ghosts
+    int separation = 1;
+    // Construct the LET
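+    // The LET (Local Essential Tree) is the local part of the distributed tree
+    // extended with the ghost cells/leaves this rank needs for its P2P and M2L
+    // interactions, as defined by the separation criterion above.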
+    auto letTree = scalfmm::tree::let::buildLetTree<group_tree_type>(
+      para, number_of_particles, particles_set, box, leaf_level, level_shared, group_size, group_size, runtime_order,
+      separation, use_leaf_distribution, use_particle_distribution);
+
+    //    if(para.io_master())
+    {
+        std::cout << cpp_tools::colors::blue << "Print tree distribution\n";
+        letTree.print_distrib(std::cout);
+        std::cout << cpp_tools::colors::reset;
+    }
+
+#ifdef SCALFMM_BUILD_PBC
+    std::cerr << cpp_tools::colors::red << "Doesn't work with PBC \n" << cpp_tools::colors::reset;
+    letTree.set_levels_above_root(nb_level_above_root);
+#endif
+    //
+    ///////////////////////////////////
+    // using fmm_operator_type = count_kernels::particles::count_fmm_operator<Dimension>;
+    // // fmm_operator_type fmm_operator{};
+    using fmm_operator_type = scalfmm::operators::fmm_operators<count_kernels::particles::count_near_field,
+                                                                count_kernels::particles::count_far_field<Dimension>>;
+    bool mutual = false;
+    int const& separation_criterion = separation;   // fmm_operator.near_field().separation_criterion();
+
+    count_kernels::particles::count_near_field nf(mutual);
+    count_kernels::particles::count_far_field<Dimension> ff{};
+    fmm_operator_type fmm_operator(nf, ff);
+    std::cout << cpp_tools::colors::red << "build_interaction_lists \n" << cpp_tools::colors::reset << std::flush;
+
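+    // Build the P2P and M2L interaction lists on the LET (local and ghost
+    // parts); this step runs sequentially on each rank.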
+    scalfmm::list::sequential::build_interaction_lists(letTree, letTree, separation_criterion, mutual);
+    std::cout << cpp_tools::colors::red << "trace \n" << cpp_tools::colors::reset << std::flush;
+    // if(para.io_master())
+    {
+        std::cout << cpp_tools::colors::red << "trace  2\n" << cpp_tools::colors::reset << std::flush;
+
+        scalfmm::io::trace(std::cout, letTree, 2);
+        std::cout << cpp_tools::colors::red << "trace  4\n" << cpp_tools::colors::reset << std::flush;
+
+        scalfmm::io::trace(std::cout, letTree, 4);
+    }
+
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::farfield;
+    // auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::nearfield;
+    auto operator_to_proceed = scalfmm::algorithms::operators_to_proceed::all;
+    // auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2l);
+    // auto operator_to_proceed = (scalfmm::algorithms::operators_to_proceed::p2m | scalfmm::algorithms::operators_to_proceed::m2m | scalfmm::algorithms::operators_to_proceed::m2l);
+
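+    // Run the distributed FMM with the MPI task-based algorithm. With the count
+    // kernel, each particle output accumulates the number of particles it has
+    // interacted with, which should end up being the global total.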
+    scalfmm::algorithms::mpi::proc_task(letTree, fmm_operator, operator_to_proceed);
+    //
+    std::size_t nb_particles_min = 20 * number_of_particles, nb_particles_max = 0, nb_per = 1;
+    bool right_number_of_particles = true;
+    int nb_part_above = 1;
+    // Compute the expected count per particle: the global number of particles,
+    // multiplied by the periodic images when periodicity is enabled.
+    for(int d = 0; d < Dimension; ++d)
+    {
+        if(pbc[d])
+        {
+            nb_per *= 3;
+            nb_part_above *= std::pow(2, nb_level_above_root + 1);
+        }
+    }
+    number_of_particles *= nb_per;
+    number_of_particles *= nb_part_above;
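+    // Check the local leaves: the output of the first particle of each leaf
+    // must equal the expected count; min/max are tracked to report mismatches.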
+    scalfmm::component::for_each_mine_leaf(
+      letTree.begin_mine_leaves(), letTree.end_mine_leaves(),
+      [&right_number_of_particles, number_of_particles, &nb_particles_max, &nb_particles_min](auto const& leaf)
+      {
+          size_t nb_part = std::get<0>(*scalfmm::container::outputs_begin(leaf.particles()));
+          nb_particles_max = std::max(nb_particles_max, nb_part);
+          nb_particles_min = std::min(nb_particles_min, nb_part);
+          if(nb_part != number_of_particles)
+          {
+              std::cout << cpp_tools::colors::red << "wrong number of particles - index " << leaf.index()
+                        << " nb particles " << nb_part << std::endl;
+              right_number_of_particles = false;
+          }
+      });
+    std::cout << cpp_tools::colors::reset << '\n';
+    if(right_number_of_particles)
+    {
+        std::cout << "Found the right number of particles - nb particles " << number_of_particles << std::endl;
+    }
+    else
+    {
+        std::cout << "wrong number of particles - nb particles (min) " << nb_particles_min << "  (max) "
+                  << nb_particles_max << " (expected) " << number_of_particles << std::endl;
+
+        if(para.io_master())
+            std::cout << "Save Tree in parallel\n";
+        // std::string outName("saveTree_" + std::to_string(rank) + ".bin");
+        std::string outName("saveTreeLet.bin");
+        std::string header("CHEBYSHEV LOW RANK ");
+        scalfmm::tools::io::save(para, outName, letTree, header);
+    }
+    std::cout << cpp_tools::colors::reset << '\n';
+    REQUIRE(right_number_of_particles);
+
+    return right_number_of_particles;
+}
+
+
+//auto run(const int& tree_height, const int& group_size,  std::string& input_file,  bool use_leaf_distribution, bool mutual = false) -> int{
+/*
+TEMPLATE_TEST_CASE("test count 1d", "[test-count-1d]", double)
+{
+    SECTION("count 1d", "[count1d]") { run<1>(4, 10,  "../data/units/test_2d_ref.fma", false); }   // h = 5
+}
+*/
+TEMPLATE_TEST_CASE("test count 2d", "[test-count-2d]", double)
+{
+    SECTION("count 2d", "[count2d]") { run<2>(3, 10, "../data/units/test_2d_ref.fma", false); }   // h = 3
+}
+
+TEMPLATE_TEST_CASE("test count 3d", "[test-count-3d]", double)
+{
+    // leaf distribution and mutual
+    SECTION("count 3d", "[count3d]") { run<3>(5, 40, "../data/units/sphere-706_source.fma", true, true); }   // h = 5
+}
+
+int main(int argc, char* argv[])
+{
+    int result = Catch::Session().run(argc, argv);
+
+    return result;
+}