diff --git a/bench/fmm_generic.hpp b/bench/fmm_generic.hpp
index 5dafcffcae69dc6ee0911ec9c984b9b552dd3c7d..75a9a31d32642ebff9fca9c5095d50e8e65d41b4 100644
--- a/bench/fmm_generic.hpp
+++ b/bench/fmm_generic.hpp
@@ -380,9 +380,9 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
 
     if(which_interp == 0)
     {
-        using options_uniform = scalfmm::options::uniform_<scalfmm::options::dense_>;
+        using options_uniform = scalfmm::options::uniform_<scalfmm::options::fft_>;
 
-        std::cout << cpp_tools::colors::blue << "<params> Interpolator: Uniform low rank" << cpp_tools::colors::reset
+        std::cout << cpp_tools::colors::blue << "<params> Interpolator: Uniform fft" << cpp_tools::colors::reset
                   << '\n';
         if(mutual)
         {
diff --git a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
index 1b4a9e8aa285e0b4d3666117e626b6b226861ef3..104fb34a7205e7c7a8884c4c8c234f8cc86698e9 100644
--- a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
+++ b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
@@ -479,8 +479,15 @@ namespace scalfmm::interpolation
             }
         }
 
-        // Preprocessing function for applying ffts on multipoles
-        // This function is called as soon as the multipoles are computed and available
+        /**
+         * @brief Preprocessing function for applying ffts on multipole arrays
+         *
+         * This function is called as soon as the multipoles are computed and available just after the P2M or M2M
+         * functions
+         * @tparam Cell
+         * @param current_cell the current cell containing the multipoles to preprocess
+         * @param thread_id the thread_id needed for the fft
+         */
         template<typename Cell>
         inline auto apply_multipoles_preprocessing_impl(Cell& current_cell,
                                                         [[maybe_unused]] size_type thread_id = 0) const -> void
@@ -507,6 +514,20 @@
         }
 
         // m2l operator impl
+        /**
+         * @brief m2l function specialized for the FFT optimization
+         *
+         * @tparam Cell
+         * @tparam ArrayScaleFactor
+         * @param source_cell
+         * @param target_cell
+         * @param products the buffer to store the result in the spectral space
+         * @param k the kernel matrix
+         * @param scale_factor the scaling factor
+         * @param n the nth output
+         * @param m the mth input
+         * @param thread_id the thread id
+         */
         template<typename Cell, typename ArrayScaleFactor>
         inline auto apply_m2l_impl(Cell const& source_cell, [[maybe_unused]] Cell& target_cell,
                                    [[maybe_unused]] buffer_type& products, interaction_matrix_type const& k,
@@ -514,13 +535,22 @@
                                    [[maybe_unused]] std::size_t m,
                                    [[maybe_unused]] size_type thread_id = 0) const -> void
         {
-            // get the transformed multipoles
+            // get the transformed multipoles (only available for uniform approximation)
             auto const& transformed_multipoles = source_cell.ctransformed_multipoles();
-
+            // component-wise product in spectral space
             tensor::product(products.at(n), transformed_multipoles.at(m), k.at(n, m), scale_factor.at(n));
         }
 
-        // postprocessing multipoles
+        // postprocessing locals
+        /**
+         * @brief postprocessing locals
+         *
+         * We apply the inverse fft to construct the real local arrays (for all outputs) of the current_cell
+         * @tparam Cell
+         * @param current_cell the cell containing the local array to post-process
+         * @param products the spectral coefficients of the local array
+         * @param thread_id the thread id
+         */
         template<typename Cell>
         inline auto apply_multipoles_postprocessing_impl(Cell& current_cell,
                                                          [[maybe_unused]] buffer_type const& products,
@@ -612,7 +642,7 @@ namespace scalfmm::interpolation
         using buffer_inner_type = typename storage_type::buffer_inner_type;   // xt::xarray<complex_type>;
         using buffer_shape_type = typename storage_type::buffer_shape_type;   // xt::xshape<kn>;
         using buffer_type =
-          typename storage_type::buffer_type;   // xt::xtensor_fixed<buffer_inner_type, buffer_shape_type>;
+          typename storage_type::buffer_type;   /// xt::xtensor_fixed<buffer_inner_type, buffer_shape_type>;
         using multipoles_container_type = typename storage_type::multipoles_container_type;
         using locals_container_type = typename storage_type::locals_container_type;
         using k_tensor_type = buffer_inner_type;   // xt::xarray<complex_type>;
diff --git a/include/scalfmm/utils/tensor.hpp b/include/scalfmm/utils/tensor.hpp
index 42bae47433549c16975925b6dd2d525bda148265..e9713b9fa19eaa67337f27f75ec16d5035cbee9d 100644
--- a/include/scalfmm/utils/tensor.hpp
+++ b/include/scalfmm/utils/tensor.hpp
@@ -289,21 +289,24 @@ namespace scalfmm::tensor
         const std::size_t vec_size = size - size % inc;
         ValueType scale_cp(scalar);
         simd_type splat(scale_cp);
-        for(std::size_t i{0}; i<vec_size; i+=inc)
+        auto ptr_t = t.data();
+        auto ptr_k = k.data();
+        auto ptr_acc = accumulate.data();
+        for(std::size_t i{0}; i < vec_size; i += inc)
         {
-            const auto t_ = xsimd::load_aligned(&t.data()[i]);
-            const auto k_ = xsimd::load_aligned(&k.data()[i]);
-            auto acc_ = xsimd::load_aligned(&accumulate.data()[i]);
+            const auto t_ = xsimd::load_aligned(&ptr_t[i]);
+            const auto k_ = xsimd::load_aligned(&ptr_k[i]);
+            auto acc_ = xsimd::load_aligned(&ptr_acc[i]);
             //auto times = k_*splat;
             //auto tmp = xsimd::fma(t_, times, acc_);
             auto tmp1 = simd_complex_prod(t_, k_);
             auto tmp2 = simd_complex_prod(tmp1, splat);
             acc_ += tmp2;
-            acc_.store_aligned(&accumulate.data()[i]);
+            acc_.store_aligned(&ptr_acc[i]);
         }
         for(std::size_t i{vec_size}; i<size; ++i)
         {
-            accumulate.data()[i] += t.data()[i]*k.data()[i]*scale_cp;
+            ptr_acc[i] += ptr_t[i] * ptr_k[i] * scale_cp;
             //auto tmp = accumulate.data()[i];
             //accumulate.data()[i] = xsimd::fma(t.data()[i],k.data()[i]*scale_cp, tmp);
         }
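
Note on the tensor.hpp hunk above: per component the kernel accumulates acc[i] += t[i] * k[i] * scale_cp, where t, k and accumulate hold complex spectral coefficients; the vectorized body forms the same product batch-wise with simd_complex_prod, and the scalar tail loop handles the remaining size % inc entries directly. Below is a minimal, self-contained scalar sketch of that accumulation, useful as a plain std::complex reference when checking the SIMD path; the helper name scalar_product_reference is illustrative only and is not part of scalfmm, and it assumes a real scaling factor.

// Scalar reference for the accumulation performed in tensor::product:
// acc[i] += t[i] * k[i] * scale, with complex-valued t, k, acc and a real scale.
// scalar_product_reference is a hypothetical helper used only for this sketch.
#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

template<typename T>
auto scalar_product_reference(std::vector<std::complex<T>>& acc, std::vector<std::complex<T>> const& t,
                              std::vector<std::complex<T>> const& k, T scale) -> void
{
    for(std::size_t i{0}; i < acc.size(); ++i)
    {
        // same operation as the SIMD body and the scalar tail loop, one component at a time
        acc[i] += t[i] * k[i] * scale;
    }
}

auto main() -> int
{
    std::vector<std::complex<double>> acc(4);   // value-initialized to (0, 0)
    std::vector<std::complex<double>> t{{1.0, 2.0}, {0.5, -1.0}, {2.0, 0.0}, {0.0, 1.0}};
    std::vector<std::complex<double>> k{{0.0, 1.0}, {1.0, 1.0}, {0.5, 0.5}, {2.0, -2.0}};

    scalar_product_reference(acc, t, k, 0.25);

    for(auto const& c: acc)
    {
        std::cout << c << '\n';
    }
    return 0;
}

Caching the raw pointers (ptr_t, ptr_k, ptr_acc) outside the loop, as the patch does, only avoids re-evaluating data() on every iteration; the arithmetic is unchanged.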