diff --git a/Src/Adaptive/new/FAdaptiveTask.hpp b/Src/Adaptive/new/FAdaptiveTask.hpp new file mode 100644 index 0000000000000000000000000000000000000000..08fd91f222c3102c550610eac13b59c043e1bc72 --- /dev/null +++ b/Src/Adaptive/new/FAdaptiveTask.hpp @@ -0,0 +1,936 @@ +#ifndef SCALFMM_ADAPTIVE_TASK_ALGO_HPP_ +#define SCALFMM_ADAPTIVE_TASK_ALGO_HPP_ + +#include <algorithm> +#include <numeric> +#include <cmath> // Used to round box differences +#include <functional> +#include <map> +#include <list> +#include <array> +#include <type_traits> +#include <memory> +#include <sstream> + +#include <omp.h> + +#include <unistd.h> + +#include "Core/FCoreCommon.hpp" +#include "Containers/FTreeCoordinate.hpp" + +#include "FTimer.hpp" + +template<class _Tree, class _Kernel> +class FAdaptiveTask { +public: + using tree_t = _Tree; + using kernel_t = _Kernel; + using FReal = typename tree_t::FReal; + + using node_t = typename tree_t::node_t; + +protected: + tree_t& _tree; + //kernel_t& _kernel; + + std::vector<std::unique_ptr<kernel_t>> _kernels; + + FTimer timer; + + template<typename K, typename Ret, typename... Args> + using has_setup = typename std::enable_if< + std::is_same<Ret, decltype(std::declval<K>().setup(std::declval<Args>()...))>::value + >::type*; + + + template<typename K, has_setup<K, void> = nullptr> + void setup_kernel() { + + for(auto& kernel : this->_kernels) { + kernel->setup(_tree); + } + } + + void setup_kernel(...) {} + + struct mock_dependency { + using mock_node = typename std::aligned_storage<sizeof(typename tree_t::node_t)>::type; + + constexpr static const std::size_t bucket_size = 512; + std::list<std::array<mock_node, bucket_size>> pool; + std::size_t mock_dependency_index = bucket_size; + + node_t* next() { + return (node_t*) (&(this->pool.back()[mock_dependency_index++])); + } + } mock_dep; + +public: + + FAdaptiveTask(tree_t& tree, kernel_t& kernel) : + _tree(tree) + { + // Place kernel objects near their threads in memory to avoid NUMA + // latency + #pragma omp parallel + { + for(int i = 0; i < omp_get_num_threads(); ++i) { + if(i == omp_get_thread_num()) { + this->_kernels.emplace_back(new kernel_t(kernel)); + } + #pragma omp barrier + } + } + } + + FAdaptiveTask(tree_t& tree, kernel_t* kernel) : + FAdaptiveTask(tree, *kernel) + {} + + FAdaptiveTask(tree_t* tree, kernel_t& kernel) : + FAdaptiveTask(*tree, kernel) + {} + + FAdaptiveTask(tree_t* tree, kernel_t* kernel) : + FAdaptiveTask(*tree, *kernel) + {} + + ~FAdaptiveTask() { + + } + + void execute(int operations = FFmmNearAndFarFields) { + this->run(operations); + } + + void run(int operations = FFmmNearAndFarFields) { + + this->setup_kernel(); + + #pragma omp parallel + { + #pragma omp single nowait + { + std::cout << "Task adaptive algorithm (" << omp_get_num_threads() << " threads)" << std::endl; + std::cout << "Master thread: " << omp_get_thread_num() << std::endl; + + if(operations & FFmmP2P) { + // A. U-list, P2P + auto ULS = std::bind(&FAdaptiveTask::u_list_step, this); + timer.time(ULS); + std::cout << " P2P: " << timer.last().count() << "s" << std::endl; + } + + if(operations & FFmmP2M) { + // 1. source to up, P2M + auto S2U = std::bind(&FAdaptiveTask::source_to_up, this); + timer.time(S2U); + std::cout << " P2M: " << timer.last().count() << "s" << std::endl; + } + + if(operations & FFmmM2M) { + // 2. up to up, M2M + auto U2U = std::bind(&FAdaptiveTask::up_to_up, this); + timer.time(U2U); + std::cout << " M2M: " << timer.last().count() << "s" << '\n'; + } + + if(operations & FFmmM2L) { + // 3a V-list, M2L + auto VLS = std::bind(&FAdaptiveTask::v_list_step, this); + timer.time(VLS); + std::cout << " M2L: " << timer.last().count() << "s" << '\n'; + } + + if(operations & FFmmP2L) { + // 3b X-list, P2L + auto XLS = std::bind(&FAdaptiveTask::x_list_step, this); + timer.time(XLS); + std::cout << " P2L: " << timer.last().count() << "s" << '\n'; + } + + if(operations & FFmmL2L) { + // 4. down to down, L2L + auto D2D = std::bind(&FAdaptiveTask::down_to_down, this); + timer.time(D2D); + std::cout << " L2L: " << timer.last().count() << "s" << '\n'; + } + + if(operations & FFmmM2P) { + // 5a W-list, M2P + auto WLS = std::bind(&FAdaptiveTask::w_list_step, this); + timer.time(WLS); + std::cout << " M2P: " << timer.last().count() << "s" << '\n'; + } + + if(operations & FFmmL2P) { + // 5b down to target, L2P + auto D2T = std::bind(&FAdaptiveTask::down_to_target, this); + timer.time(D2T); + std::cout << " L2P: " << timer.last().count() << "s" << '\n'; + } + + std::cout << " task created: " + << std::accumulate(this->timer.measures().begin(), + this->timer.measures().end(), + std::chrono::seconds(0)).count() + << '\n'; + } + } + } + + // P2M + void source_to_up() { + + for(node_t* leaf : _tree.leaves()) { + #pragma omp task firstprivate(leaf) depend(inout: leaf[:1]) + { + const int thread_num = omp_get_thread_num(); + _kernels[thread_num]->P2M(leaf->getData(), leaf->getParticleContainer()); + } + } + + } + + // M2M + void up_to_up() { + + for(node_t& n : _tree.post_order_walk()) { + node_t* node = &n; + if(! node->is_leaf()) { + + node_t* children[node_t::child_count] = {}; + for(node_t* child : node->getChildren()) { + children[child->getIndex() & (node_t::child_count-1)] = child; + } + + #pragma omp task \ + depend(in: \ + children[0][:1], \ + children[1][:1], \ + children[2][:1], \ + children[3][:1], \ + children[4][:1], \ + children[5][:1], \ + children[6][:1], \ + children[7][:1]) \ + depend(out: node[:1]) + { + const int thread_num = omp_get_thread_num(); + + // Fill array of children data + std::array<typename node_t::data_t*, node_t::child_count> child_data; + for(node_t* child : node->getChildren()) { + + /// Last bits of child index give the index relative to parent + child_data[child->getIndex() & (node_t::child_count-1)] = child->getData(); + } + + // Call kernel module + this->_kernels[thread_num]->M2M(node->getData(), child_data.data(), + static_cast<int>(node->getDepth())); + } + } + } + // #pragma omp taskwait + } + + // M2L + void v_list_step() { + + for(node_t& n : _tree.in_order_walk()) { + node_t* node = &n; + if(node->is_leaf() && node->getParticleContainer()->size() == 0) { + continue; + } + + // Generate task dependencies + + // The array of dependencies, we know that there cannot be more than + // 7^Dim cells involved in a M2L, in 3D, this is 343 + node_t* task_deps[343]; + std::size_t idx_dep = 0; + // Add existing dependencies + for(node_t* v_item : node->V) { + if(v_item->is_leaf() + && v_item->getParticleContainer()->size() == 0) { + continue; + } + task_deps[idx_dep] = v_item; + ++idx_dep; + } + // Add mock dependencies, these are generated on the fly and used + // only once, that way they can never stop a task from starting + while(idx_dep < 343) { + task_deps[idx_dep] = this->mock_dep.next(); + ++idx_dep; + } + + #pragma omp task \ + depend(in: \ + task_deps[0][:1], \ + task_deps[1][:1], \ + task_deps[2][:1], \ + task_deps[3][:1], \ + task_deps[4][:1], \ + task_deps[5][:1], \ + task_deps[6][:1], \ + task_deps[7][:1], \ + task_deps[8][:1], \ + task_deps[9][:1], \ + task_deps[10][:1], \ + task_deps[11][:1], \ + task_deps[12][:1], \ + task_deps[13][:1], \ + task_deps[14][:1], \ + task_deps[15][:1], \ + task_deps[16][:1], \ + task_deps[17][:1], \ + task_deps[18][:1], \ + task_deps[19][:1], \ + task_deps[20][:1], \ + task_deps[21][:1], \ + task_deps[22][:1], \ + task_deps[23][:1], \ + task_deps[24][:1], \ + task_deps[25][:1], \ + task_deps[26][:1], \ + task_deps[27][:1], \ + task_deps[28][:1], \ + task_deps[29][:1], \ + task_deps[30][:1], \ + task_deps[31][:1], \ + task_deps[32][:1], \ + task_deps[33][:1], \ + task_deps[34][:1], \ + task_deps[35][:1], \ + task_deps[36][:1], \ + task_deps[37][:1], \ + task_deps[38][:1], \ + task_deps[39][:1], \ + task_deps[40][:1], \ + task_deps[41][:1], \ + task_deps[42][:1], \ + task_deps[43][:1], \ + task_deps[44][:1], \ + task_deps[45][:1], \ + task_deps[46][:1], \ + task_deps[47][:1], \ + task_deps[48][:1], \ + task_deps[49][:1], \ + task_deps[50][:1], \ + task_deps[51][:1], \ + task_deps[52][:1], \ + task_deps[53][:1], \ + task_deps[54][:1], \ + task_deps[55][:1], \ + task_deps[56][:1], \ + task_deps[57][:1], \ + task_deps[58][:1], \ + task_deps[59][:1], \ + task_deps[60][:1], \ + task_deps[61][:1], \ + task_deps[62][:1], \ + task_deps[63][:1], \ + task_deps[64][:1], \ + task_deps[65][:1], \ + task_deps[66][:1], \ + task_deps[67][:1], \ + task_deps[68][:1], \ + task_deps[69][:1], \ + task_deps[70][:1], \ + task_deps[71][:1], \ + task_deps[72][:1], \ + task_deps[73][:1], \ + task_deps[74][:1], \ + task_deps[75][:1], \ + task_deps[76][:1], \ + task_deps[77][:1], \ + task_deps[78][:1], \ + task_deps[79][:1], \ + task_deps[80][:1], \ + task_deps[81][:1], \ + task_deps[82][:1], \ + task_deps[83][:1], \ + task_deps[84][:1], \ + task_deps[85][:1], \ + task_deps[86][:1], \ + task_deps[87][:1], \ + task_deps[88][:1], \ + task_deps[89][:1], \ + task_deps[90][:1], \ + task_deps[91][:1], \ + task_deps[92][:1], \ + task_deps[93][:1], \ + task_deps[94][:1], \ + task_deps[95][:1], \ + task_deps[96][:1], \ + task_deps[97][:1], \ + task_deps[98][:1], \ + task_deps[99][:1], \ + task_deps[100][:1], \ + task_deps[101][:1], \ + task_deps[102][:1], \ + task_deps[103][:1], \ + task_deps[104][:1], \ + task_deps[105][:1], \ + task_deps[106][:1], \ + task_deps[107][:1], \ + task_deps[108][:1], \ + task_deps[109][:1], \ + task_deps[110][:1], \ + task_deps[111][:1], \ + task_deps[112][:1], \ + task_deps[113][:1], \ + task_deps[114][:1], \ + task_deps[115][:1], \ + task_deps[116][:1], \ + task_deps[117][:1], \ + task_deps[118][:1], \ + task_deps[119][:1], \ + task_deps[120][:1], \ + task_deps[121][:1], \ + task_deps[122][:1], \ + task_deps[123][:1], \ + task_deps[124][:1], \ + task_deps[125][:1], \ + task_deps[126][:1], \ + task_deps[127][:1], \ + task_deps[128][:1], \ + task_deps[129][:1], \ + task_deps[130][:1], \ + task_deps[131][:1], \ + task_deps[132][:1], \ + task_deps[133][:1], \ + task_deps[134][:1], \ + task_deps[135][:1], \ + task_deps[136][:1], \ + task_deps[137][:1], \ + task_deps[138][:1], \ + task_deps[139][:1], \ + task_deps[140][:1], \ + task_deps[141][:1], \ + task_deps[142][:1], \ + task_deps[143][:1], \ + task_deps[144][:1], \ + task_deps[145][:1], \ + task_deps[146][:1], \ + task_deps[147][:1], \ + task_deps[148][:1], \ + task_deps[149][:1], \ + task_deps[150][:1], \ + task_deps[151][:1], \ + task_deps[152][:1], \ + task_deps[153][:1], \ + task_deps[154][:1], \ + task_deps[155][:1], \ + task_deps[156][:1], \ + task_deps[157][:1], \ + task_deps[158][:1], \ + task_deps[159][:1], \ + task_deps[160][:1], \ + task_deps[161][:1], \ + task_deps[162][:1], \ + task_deps[163][:1], \ + task_deps[164][:1], \ + task_deps[165][:1], \ + task_deps[166][:1], \ + task_deps[167][:1], \ + task_deps[168][:1], \ + task_deps[169][:1], \ + task_deps[170][:1], \ + task_deps[171][:1], \ + task_deps[172][:1], \ + task_deps[173][:1], \ + task_deps[174][:1], \ + task_deps[175][:1], \ + task_deps[176][:1], \ + task_deps[177][:1], \ + task_deps[178][:1], \ + task_deps[179][:1], \ + task_deps[180][:1], \ + task_deps[181][:1], \ + task_deps[182][:1], \ + task_deps[183][:1], \ + task_deps[184][:1], \ + task_deps[185][:1], \ + task_deps[186][:1], \ + task_deps[187][:1], \ + task_deps[188][:1], \ + task_deps[189][:1], \ + task_deps[190][:1], \ + task_deps[191][:1], \ + task_deps[192][:1], \ + task_deps[193][:1], \ + task_deps[194][:1], \ + task_deps[195][:1], \ + task_deps[196][:1], \ + task_deps[197][:1], \ + task_deps[198][:1], \ + task_deps[199][:1], \ + task_deps[200][:1], \ + task_deps[201][:1], \ + task_deps[202][:1], \ + task_deps[203][:1], \ + task_deps[204][:1], \ + task_deps[205][:1], \ + task_deps[206][:1], \ + task_deps[207][:1], \ + task_deps[208][:1], \ + task_deps[209][:1], \ + task_deps[210][:1], \ + task_deps[211][:1], \ + task_deps[212][:1], \ + task_deps[213][:1], \ + task_deps[214][:1], \ + task_deps[215][:1], \ + task_deps[216][:1], \ + task_deps[217][:1], \ + task_deps[218][:1], \ + task_deps[219][:1], \ + task_deps[220][:1], \ + task_deps[221][:1], \ + task_deps[222][:1], \ + task_deps[223][:1], \ + task_deps[224][:1], \ + task_deps[225][:1], \ + task_deps[226][:1], \ + task_deps[227][:1], \ + task_deps[228][:1], \ + task_deps[229][:1], \ + task_deps[230][:1], \ + task_deps[231][:1], \ + task_deps[232][:1], \ + task_deps[233][:1], \ + task_deps[234][:1], \ + task_deps[235][:1], \ + task_deps[236][:1], \ + task_deps[237][:1], \ + task_deps[238][:1], \ + task_deps[239][:1], \ + task_deps[240][:1], \ + task_deps[241][:1], \ + task_deps[242][:1], \ + task_deps[243][:1], \ + task_deps[244][:1], \ + task_deps[245][:1], \ + task_deps[246][:1], \ + task_deps[247][:1], \ + task_deps[248][:1], \ + task_deps[249][:1], \ + task_deps[250][:1], \ + task_deps[251][:1], \ + task_deps[252][:1], \ + task_deps[253][:1], \ + task_deps[254][:1], \ + task_deps[255][:1], \ + task_deps[256][:1], \ + task_deps[257][:1], \ + task_deps[258][:1], \ + task_deps[259][:1], \ + task_deps[260][:1], \ + task_deps[261][:1], \ + task_deps[262][:1], \ + task_deps[263][:1], \ + task_deps[264][:1], \ + task_deps[265][:1], \ + task_deps[266][:1], \ + task_deps[267][:1], \ + task_deps[268][:1], \ + task_deps[269][:1], \ + task_deps[270][:1], \ + task_deps[271][:1], \ + task_deps[272][:1], \ + task_deps[273][:1], \ + task_deps[274][:1], \ + task_deps[275][:1], \ + task_deps[276][:1], \ + task_deps[277][:1], \ + task_deps[278][:1], \ + task_deps[279][:1], \ + task_deps[280][:1], \ + task_deps[281][:1], \ + task_deps[282][:1], \ + task_deps[283][:1], \ + task_deps[284][:1], \ + task_deps[285][:1], \ + task_deps[286][:1], \ + task_deps[287][:1], \ + task_deps[288][:1], \ + task_deps[289][:1], \ + task_deps[290][:1], \ + task_deps[291][:1], \ + task_deps[292][:1], \ + task_deps[293][:1], \ + task_deps[294][:1], \ + task_deps[295][:1], \ + task_deps[296][:1], \ + task_deps[297][:1], \ + task_deps[298][:1], \ + task_deps[299][:1], \ + task_deps[300][:1], \ + task_deps[301][:1], \ + task_deps[302][:1], \ + task_deps[303][:1], \ + task_deps[304][:1], \ + task_deps[305][:1], \ + task_deps[306][:1], \ + task_deps[307][:1], \ + task_deps[308][:1], \ + task_deps[309][:1], \ + task_deps[310][:1], \ + task_deps[311][:1], \ + task_deps[312][:1], \ + task_deps[313][:1], \ + task_deps[314][:1], \ + task_deps[315][:1], \ + task_deps[316][:1], \ + task_deps[317][:1], \ + task_deps[318][:1], \ + task_deps[319][:1], \ + task_deps[320][:1], \ + task_deps[321][:1], \ + task_deps[322][:1], \ + task_deps[323][:1], \ + task_deps[324][:1], \ + task_deps[325][:1], \ + task_deps[326][:1], \ + task_deps[327][:1], \ + task_deps[328][:1], \ + task_deps[329][:1], \ + task_deps[330][:1], \ + task_deps[331][:1], \ + task_deps[332][:1], \ + task_deps[333][:1], \ + task_deps[334][:1], \ + task_deps[335][:1], \ + task_deps[336][:1], \ + task_deps[337][:1], \ + task_deps[338][:1], \ + task_deps[339][:1], \ + task_deps[340][:1], \ + task_deps[341][:1], \ + task_deps[342][:1] \ + ) depend(out: node[:1]) firstprivate(node) + { + + const int thread_num = omp_get_thread_num(); + + std::vector<decltype(std::declval<const node_t>().getData())> v_item_data_list; + std::vector<int> v_item_indices; + // Needed to compute offset between boxes + for(node_t* v_item : node->V) { + if(v_item->is_leaf() + && v_item->getParticleContainer()->size() == 0) { + continue; + } + v_item_indices.push_back(compute_box_offset_index(node, v_item, 3)); + v_item_data_list.push_back(v_item->getData()); + } + + // Call kernel M2L operator + this->_kernels[thread_num]->M2L(node->getData(), v_item_data_list.data(), v_item_indices.data(), static_cast<int>(v_item_data_list.size()), static_cast<int>(node->getDepth())); + } + + } + // #pragma omp taskwait + } + + // P2L + void x_list_step() { + /* NOTE: the X list and W list are complementary: if A is in X(B) then B + * is in W(A). + * + * We loop over the leaves first to detect the empty ones early on. + */ + for(node_t* leaf : _tree.leaves()) { + if(leaf->getParticleContainer()->size() > 0) { + for(node_t* w_item : leaf->W) { + #pragma omp task depend(in: leaf[:1]) depend(out: w_item[:1]) + { + const int thread_num = omp_get_thread_num(); + this->_kernels[thread_num]->P2L(w_item->getData(), leaf->getParticleContainer()); + } + } + } + } + // #pragma omp taskwait + } + + // L2L + void down_to_down() { + for(node_t& n : _tree.pre_order_walk()) { + node_t* node = &n; + if(! node->is_leaf()) { + + node_t* children[node_t::child_count]; + for(std::size_t i = 0; i < node_t::child_count; ++i) { + children[i] = node->getChild(i); + } + + #pragma omp task \ + depend(in: node[:1]) \ + depend(inout: children[0][:1], \ + children[1][:1], \ + children[2][:1], \ + children[3][:1], \ + children[4][:1], \ + children[5][:1], \ + children[6][:1], \ + children[7][:1] \ + ) + + { + const int thread_num = omp_get_thread_num(); + + typename node_t::data_t* child_data[node_t::child_count]; + for(std::size_t i = 0; i < node_t::child_count; ++i) { + child_data[i] = node->getChild(i)->getData(); + } + this->_kernels[thread_num]->L2L(node->getData(), child_data, static_cast<int>(node->getDepth())); + } + } + } + // #pragma omp taskwait + } + + // M2P + void w_list_step() { + for(node_t* leaf : _tree.leaves()) { + if(leaf->getParticleContainer()->size() > 0) { + for(node_t* w_item : leaf->W) { + #pragma omp task depend(inout: leaf[:1]) depend(in: w_item[:1]) + { + const int thread_num = omp_get_thread_num(); + this->_kernels[thread_num]->M2P(w_item->getData(), leaf->getParticleContainer()); + } + } + } + } + // #pragma omp taskwait + } + + // L2P + void down_to_target() { + for(node_t* leaf : _tree.leaves()) { + #pragma omp task depend(inout: leaf[:1]) + { + const int thread_num = omp_get_thread_num(); + this->_kernels[thread_num]->L2P(leaf->getData(), leaf->getParticleContainer()); + } + } + // #pragma omp taskwait + } + + + /** \brief Direct computation step (P2P) + * + * For each tree leaf, a direct computation is done with the adjacent + * leaves. + * + * The symmetric computation kernels expect to receive the list of adjacent + * leaves, stored in a leaf's U list, sorted by offset index. + * + * The offset index is of a node compared to another follows the + * numerotation of the adjacent nodes, on each axis. In 3D, this woud give, + * with L the reference leaf: + * + * x x x + * 0 1 2 | 9 10 11 | 18 19 20 + * y 3 4 5 | 12 L 14 | 21 22 23 + * 6 7 8 | 15 16 17 | 24 25 26 + * z=0 z=1 z=2 + * + * When two node that are to be compared are not on the same level, the + * lowest one's ancestor that is at the highest one's level is used to + * compute the index. + * + * To statisfy this condition, the leaves have to be sorted before being + * given to the kernel P2P method. + * + */ + void u_list_step() { + using container_t = typename node_t::particle_container_t; + + // The following containers are reused for each leaf to avoid repeated + // dynamic allocation. + + for(node_t* leaf : _tree.leaves()) { + + container_t* const leaf_source_particle_container = + leaf->getParticleContainer(); + container_t* const leaf_target_particle_container = + leaf->getParticleContainer(); + + // Skip empty leaves + if( leaf_source_particle_container->size() == 0 + && leaf_target_particle_container->size() == 0) { + continue; + } + + auto it = leaf->U.begin(); + bool do_inner = true; + + while(it != leaf->U.end()) { + constexpr const std::size_t max_task_size = 27; + std::size_t i = 0; + + node_t* task_deps[max_task_size]; + + auto first = it; + + while((it != leaf->U.end()) && (i < max_task_size)) { + task_deps[i] = *it; + ++i; + ++it; + } + + while(i < max_task_size) { + task_deps[i] = this->mock_dep.next(); + ++i; + } + + + #pragma omp task \ + firstprivate(leaf, leaf_source_particle_container, leaf_target_particle_container, first, it, do_inner) \ + depend(inout: \ + leaf[:1], \ + task_deps[0][:1], \ + task_deps[1][:1], \ + task_deps[2][:1], \ + task_deps[3][:1], \ + task_deps[4][:1], \ + task_deps[5][:1], \ + task_deps[6][:1], \ + task_deps[7][:1], \ + task_deps[8][:1], \ + task_deps[9][:1], \ + task_deps[10][:1], \ + task_deps[11][:1], \ + task_deps[12][:1], \ + task_deps[13][:1], \ + task_deps[14][:1], \ + task_deps[15][:1], \ + task_deps[16][:1], \ + task_deps[17][:1], \ + task_deps[18][:1], \ + task_deps[19][:1], \ + task_deps[20][:1], \ + task_deps[21][:1], \ + task_deps[22][:1], \ + task_deps[23][:1], \ + task_deps[24][:1], \ + task_deps[25][:1], \ + task_deps[26][:1] \ + ) + { + const int thread_num = omp_get_thread_num(); + // Vectors to be filled after sort and passed to kernel P2P + std::vector<container_t*> u_item_source_particle_containers; + std::vector<int> u_item_indices; + + auto last = it; + + while(first != last) { + // Skip empty u_items + if((*first) == leaf || (*first)->getParticleContainer()->size() == 0) { + ++first; + continue; + } + u_item_source_particle_containers.push_back((*first)->getParticleContainer()); + u_item_indices.push_back(compute_box_offset_index(leaf, *first, 1)); + ++first; + } + // Call P2P on vectors data + this->_kernels[thread_num] + ->P2PTask( + do_inner, + leaf_target_particle_container, + u_item_source_particle_containers.data(), + u_item_indices.data(), + static_cast<int>(u_item_source_particle_containers.size()) + ); + } + do_inner = false; + + } + + // #pragma omp task firstprivate(leaf, leaf_source_particle_container, leaf_target_particle_container) + // { + // const int thread_num = omp_get_thread_num(); + // // Vectors to be filled after sort and passed to kernel P2P + // std::vector<container_t*> u_item_source_particle_containers; + // std::vector<int> u_item_indices; + + // for(node_t* u_item : leaf->U) { + // // Skip empty u_items + // if(u_item == leaf || u_item->getParticleContainer()->size() == 0) { + // continue; + // } + // u_item_source_particle_containers.push_back(u_item->getParticleContainer()); + // u_item_indices.push_back(compute_box_offset_index(leaf, u_item, 1)); + // } + + // // Call P2P on vectors data + // this->_kernels[thread_num] + // ->P2PTask( + // true, + // leaf_target_particle_container, + // u_item_source_particle_containers.data(), + // static_cast<int>(u_item_source_particle_containers.size()) + // ); + + // } + } + + #pragma omp taskwait + } + + + /** Compute resulting index in pre-computation array. + * + * Kernels precompute some operators according to node offsets. This method + * returns the `other_node` index in the precomputation array. + * + * The index has the following form, with d the space dimension: + * + * x+n * 7^d + y+n * 7^(d-1) + ... + z+n + * + * Because an `other_node` is at most `n` boxes away in every direction from a + * `node`, in order to get a positive index, we add `n` to the offset in every + * direction. + * + * \param node The target node + * \param other_node The interacting node + * \param n The longest possible distance between node and other_node in + * terms of boxes on an axis + * + * \warning If the nodes are not at the same level, the lowest one's + * ancestor at the highest one's level is used. + */ + int compute_box_offset_index(node_t* node, node_t* other_node, const std::size_t n) { + while(node->getDepth() > other_node->getDepth()) { + node = node->getParent(); + } + while(node->getDepth() < other_node->getDepth()) { + other_node = other_node->getParent(); + } + + typename node_t::FReal box_width = (node->getBox().c2 - node->getBox().c1)[0]; + + auto center_offset = other_node->getBox().center() - node->getBox().center(); + std::size_t other_node_index = 0; + for(std::size_t i = 0; i < node_t::Dim; ++i) { + other_node_index = other_node_index * (2 * n + 1) + + static_cast<unsigned long>(std::lround(center_offset[i] / box_width)) + + n; + } + + return static_cast<int>(other_node_index); + } + + +}; + +#endif