diff --git a/bench/fmm_generic.hpp b/bench/fmm_generic.hpp
index 5dafcffcae69dc6ee0911ec9c984b9b552dd3c7d..75a9a31d32642ebff9fca9c5095d50e8e65d41b4 100644
--- a/bench/fmm_generic.hpp
+++ b/bench/fmm_generic.hpp
@@ -380,9 +380,9 @@ auto main([[maybe_unused]] int argc, [[maybe_unused]] char* argv[]) -> int
 
     if(which_interp == 0)
     {
-        using options_uniform = scalfmm::options::uniform_<scalfmm::options::dense_>;
+        using options_uniform = scalfmm::options::uniform_<scalfmm::options::fft_>;
 
-        std::cout << cpp_tools::colors::blue << "<params> Interpolator: Uniform low rank" << cpp_tools::colors::reset
+        std::cout << cpp_tools::colors::blue << "<params> Interpolator: Uniform fft" << cpp_tools::colors::reset
                   << '\n';
         if(mutual)
         {
diff --git a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
index 1b4a9e8aa285e0b4d3666117e626b6b226861ef3..104fb34a7205e7c7a8884c4c8c234f8cc86698e9 100644
--- a/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
+++ b/include/scalfmm/interpolation/uniform/uniform_interpolator.hpp
@@ -479,8 +479,15 @@ namespace scalfmm::interpolation
             }
         }
 
-        // Preprocessing function for applying ffts on multipoles
-        // This function is called as soon as the multipoles are computed and available
+        /**
+         * @brief Preprocessing function for applying ffts on multipole arrays
+         *
+         * This function is called as soon as the multipoles are computed and available just after the P2M or M2M
+         * functions
+         * @tparam Cell
+         * @param current_cell the current cell containing the multipoles to preprocess
+         * @param thread_id the thread_id needed for the fft
+         */
         template<typename Cell>
         inline auto apply_multipoles_preprocessing_impl(Cell& current_cell,
                                                         [[maybe_unused]] size_type thread_id = 0) const -> void
@@ -507,6 +514,20 @@
         }
 
         // m2l operator impl
+        /**
+         * @brief m2l function specialized for the FFT optimization
+         *
+         * @tparam Cell
+         * @tparam ArrayScaleFactor
+         * @param source_cell
+         * @param target_cell
+         * @param products the buffer to store the result in the spectral space
+         * @param k the kernel matrix
+         * @param scale_factor the scaling factor
+         * @param n the nth output
+         * @param m the mth input
+         * @param thread_id the thread id
+         */
         template<typename Cell, typename ArrayScaleFactor>
         inline auto apply_m2l_impl(Cell const& source_cell, [[maybe_unused]] Cell& target_cell,
                                    [[maybe_unused]] buffer_type& products, interaction_matrix_type const& k,
@@ -514,13 +535,22 @@
                                    [[maybe_unused]] std::size_t m,
                                    [[maybe_unused]] size_type thread_id = 0) const -> void
         {
-            // get the transformed multipoles
+            // get the transformed multipoles (only available for uniform approximation)
             auto const& transformed_multipoles = source_cell.ctransformed_multipoles();
-
+            // component-wise product in spectral space
             tensor::product(products.at(n), transformed_multipoles.at(m), k.at(n, m), scale_factor.at(n));
         }
 
-        // postprocessing multipoles
+        // postprocessing locals
+        /**
+         * @brief postprocessing locals
+         *
+         * We apply the inverse fft to construct the real local arrays (for all outputs) of the current_cell
+         * @tparam Cell
+         * @param current_cell the cell containing the local array to post-process
+         * @param products the spectral coefficients of the local array
+         * @param thread_id the thread id
+         */
         template<typename Cell>
         inline auto apply_multipoles_postprocessing_impl(Cell& current_cell,
                                                          [[maybe_unused]] buffer_type const& products,
@@ -612,7 +642,7 @@ namespace scalfmm::interpolation
         using buffer_inner_type = typename storage_type::buffer_inner_type;   // xt::xarray<complex_type>;
         using buffer_shape_type = typename storage_type::buffer_shape_type;   // xt::xshape<kn>;
         using buffer_type =
-          typename storage_type::buffer_type;   // xt::xtensor_fixed<buffer_inner_type, buffer_shape_type>;
+          typename storage_type::buffer_type;   /// xt::xtensor_fixed<buffer_inner_type, buffer_shape_type>;
         using multipoles_container_type = typename storage_type::multipoles_container_type;
         using locals_container_type = typename storage_type::locals_container_type;
         using k_tensor_type = buffer_inner_type;   // xt::xarray<complex_type>;
diff --git a/include/scalfmm/utils/tensor.hpp b/include/scalfmm/utils/tensor.hpp
index 42bae47433549c16975925b6dd2d525bda148265..e9713b9fa19eaa67337f27f75ec16d5035cbee9d 100644
--- a/include/scalfmm/utils/tensor.hpp
+++ b/include/scalfmm/utils/tensor.hpp
@@ -289,21 +289,24 @@ namespace scalfmm::tensor
         const std::size_t vec_size = size - size % inc;
         ValueType scale_cp(scalar);
         simd_type splat(scale_cp);
-        for(std::size_t i{0}; i<vec_size; i+=inc)
+        auto ptr_t = t.data();
+        auto ptr_k = k.data();
+        auto ptr_acc = accumulate.data();
+        for(std::size_t i{0}; i < vec_size; i += inc)
         {
-            const auto t_ = xsimd::load_aligned(&t.data()[i]);
-            const auto k_ = xsimd::load_aligned(&k.data()[i]);
-            auto acc_ = xsimd::load_aligned(&accumulate.data()[i]);
+            const auto t_ = xsimd::load_aligned(&ptr_t[i]);
+            const auto k_ = xsimd::load_aligned(&ptr_k[i]);
+            auto acc_ = xsimd::load_aligned(&ptr_acc[i]);
             //auto times = k_*splat;
             //auto tmp = xsimd::fma(t_, times, acc_);
             auto tmp1 = simd_complex_prod(t_, k_);
             auto tmp2 = simd_complex_prod(tmp1, splat);
             acc_ += tmp2;
-            acc_.store_aligned(&accumulate.data()[i]);
+            acc_.store_aligned(&ptr_acc[i]);
         }
         for(std::size_t i{vec_size}; i<size; ++i)
         {
-            accumulate.data()[i] += t.data()[i]*k.data()[i]*scale_cp;
+            ptr_acc[i] += ptr_t[i] * ptr_k[i] * scale_cp;
             //auto tmp = accumulate.data()[i];
             //accumulate.data()[i] = xsimd::fma(t.data()[i],k.data()[i]*scale_cp, tmp);
         }
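
Note on the tensor.hpp hunk above: per component the kernel accumulates acc[i] += t[i] * k[i] * scale_cp, where t, k and accumulate hold complex spectral coefficients; the vectorized body forms the same product batch-wise with simd_complex_prod, and the scalar tail loop handles the remaining size % inc entries directly. Below is a minimal, self-contained scalar sketch of that accumulation, useful as a plain std::complex reference when checking the SIMD path; the helper name scalar_product_reference is illustrative only and is not part of scalfmm, and it assumes a real scaling factor.

// Scalar reference for the accumulation performed in tensor::product:
// acc[i] += t[i] * k[i] * scale, with complex-valued t, k, acc and a real scale.
// scalar_product_reference is a hypothetical helper used only for this sketch.
#include <complex>
#include <cstddef>
#include <iostream>
#include <vector>

template<typename T>
auto scalar_product_reference(std::vector<std::complex<T>>& acc, std::vector<std::complex<T>> const& t,
                              std::vector<std::complex<T>> const& k, T scale) -> void
{
    for(std::size_t i{0}; i < acc.size(); ++i)
    {
        // same operation as the SIMD body and the scalar tail loop, one component at a time
        acc[i] += t[i] * k[i] * scale;
    }
}

auto main() -> int
{
    std::vector<std::complex<double>> acc(4);   // value-initialized to (0, 0)
    std::vector<std::complex<double>> t{{1.0, 2.0}, {0.5, -1.0}, {2.0, 0.0}, {0.0, 1.0}};
    std::vector<std::complex<double>> k{{0.0, 1.0}, {1.0, 1.0}, {0.5, 0.5}, {2.0, -2.0}};

    scalar_product_reference(acc, t, k, 0.25);

    for(auto const& c: acc)
    {
        std::cout << c << '\n';
    }
    return 0;
}

Caching the raw pointers (ptr_t, ptr_k, ptr_acc) outside the loop, as the patch does, only avoids re-evaluating data() on every iteration; the arithmetic is unchanged.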