diff --git a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp index de2f602f8847e0f27a903abed5e2fe4c6ea6ebae..b8ebcef20e8a637c5794f6734a3a9e0fb745dbe6 100644 --- a/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskDepAlgorithm.hpp @@ -34,14 +34,14 @@ #ifdef OPENMP_SUPPORT_PRIORITY #define priority_if_supported(x) priority(x) enum FGroupTaskDepAlgorithm_Priorities{ - FGroupTaskDepAlgorithm_Prio_P2M = 5, - FGroupTaskDepAlgorithm_Prio_M2M = 4, - FGroupTaskDepAlgorithm_Prio_M2L_High = 3, - FGroupTaskDepAlgorithm_Prio_L2L = 2, - FGroupTaskDepAlgorithm_Prio_P2P_Big = 1, - FGroupTaskDepAlgorithm_Prio_M2L = 0, - FGroupTaskDepAlgorithm_Prio_L2P = -1, - FGroupTaskDepAlgorithm_Prio_P2P_Small = -2 + FGroupTaskDepAlgorithm_Prio_P2M = 9, + FGroupTaskDepAlgorithm_Prio_M2M = 8, + FGroupTaskDepAlgorithm_Prio_M2L_High = 7, + FGroupTaskDepAlgorithm_Prio_L2L = 6, + FGroupTaskDepAlgorithm_Prio_P2P_Big = 5, + FGroupTaskDepAlgorithm_Prio_M2L = 4, + FGroupTaskDepAlgorithm_Prio_L2P = 3, + FGroupTaskDepAlgorithm_Prio_P2P_Small = 2 }; #else #define priority_if_supported(x) @@ -65,6 +65,7 @@ protected: const int MaxThreads; //< The number of threads OctreeClass*const tree; //< The Tree KernelClass** kernels; //< The kernels + const bool noCommuteAtLastLevel; #ifdef SCALFMM_TIME_OMPTASKS FTaskTimer taskTimeRecorder; @@ -72,7 +73,8 @@ protected: public: FGroupTaskDepAlgorithm(OctreeClass*const inTree, KernelClass* inKernels, const int inMaxThreads = -1) - : MaxThreads(inMaxThreads==-1?omp_get_max_threads():inMaxThreads), tree(inTree), kernels(nullptr) + : MaxThreads(inMaxThreads==-1?omp_get_max_threads():inMaxThreads), tree(inTree), kernels(nullptr), + noCommuteAtLastLevel(getenv("SCALFMM_NO_COMMUTE_LAST_L2L") != NULL && getenv("SCALFMM_NO_COMMUTE_LAST_L2L")[0] != '0'?true:false) #ifdef SCALFMM_TIME_OMPTASKS , taskTimeRecorder(MaxThreads) #endif @@ -103,6 +105,7 @@ public: taskTimeRecorder.init(omp_get_thread_num()); } #endif + FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_LAST_L2L " << noCommuteAtLastLevel << "\n"); } ~FGroupTaskDepAlgorithm(){ @@ -627,45 +630,90 @@ protected: subCellGroup = (*iterChildCells); subCellLocalGroupsLocal = (*iterChildCells)->getRawLocalBuffer(); - #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(FGroupTaskDepAlgorithm_Prio_L2L) - { - KernelClass*const kernel = kernels[omp_get_thread_num()]; + if(noCommuteAtLastLevel == false || idxLevel != FAbstractAlgorithm::lowerWorkingLevel - 2){ + #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(commute_if_supported: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(FGroupTaskDepAlgorithm_Prio_L2L) + { + KernelClass*const kernel = kernels[omp_get_thread_num()]; - const MortonIndex firstParent = FMath::Max(currentCells->getStartingIndex(), subCellGroup->getStartingIndex()>>3); - const MortonIndex lastParent = FMath::Min(currentCells->getEndingIndex()-1, (subCellGroup->getEndingIndex()-1)>>3); - FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((lastParent * 20) + idxLevel) * 8 + 4, "L2L")); + const MortonIndex firstParent = FMath::Max(currentCells->getStartingIndex(), subCellGroup->getStartingIndex()>>3); + const MortonIndex lastParent = FMath::Min(currentCells->getEndingIndex()-1, (subCellGroup->getEndingIndex()-1)>>3); + FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((lastParent * 20) + idxLevel) * 8 + 4, "L2L")); - int idxParentCell = currentCells->getCellIndex(firstParent); - FAssertLF(idxParentCell != -1); + int idxParentCell = currentCells->getCellIndex(firstParent); + FAssertLF(idxParentCell != -1); - int idxChildCell = subCellGroup->getFistChildIdx(firstParent); - FAssertLF(idxChildCell != -1); - CellClass childData[8]; + int idxChildCell = subCellGroup->getFistChildIdx(firstParent); + FAssertLF(idxChildCell != -1); + CellClass childData[8]; - while(true){ - CellClass cell = currentCells->getDownCell(idxParentCell); - FAssertLF(cell.getMortonIndex() == currentCells->getCellMortonIndex(idxParentCell)); - CellClass* child[8] = {nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr}; + while(true){ + CellClass cell = currentCells->getDownCell(idxParentCell); + FAssertLF(cell.getMortonIndex() == currentCells->getCellMortonIndex(idxParentCell)); + CellClass* child[8] = {nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr}; - FAssertLF(cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); + FAssertLF(cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); - do{ - const int idxChild = ((subCellGroup->getCellMortonIndex(idxChildCell)) & 7); - FAssertLF(child[idxChild] == nullptr); - childData[idxChild] = subCellGroup->getDownCell(idxChildCell); - FAssertLF(subCellGroup->getCellMortonIndex(idxChildCell) == childData[idxChild].getMortonIndex()); - child[idxChild] = &childData[idxChild]; + do{ + const int idxChild = ((subCellGroup->getCellMortonIndex(idxChildCell)) & 7); + FAssertLF(child[idxChild] == nullptr); + childData[idxChild] = subCellGroup->getDownCell(idxChildCell); + FAssertLF(subCellGroup->getCellMortonIndex(idxChildCell) == childData[idxChild].getMortonIndex()); + child[idxChild] = &childData[idxChild]; - idxChildCell += 1; - }while(idxChildCell != subCellGroup->getNumberOfCellsInBlock() && cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); + idxChildCell += 1; + }while(idxChildCell != subCellGroup->getNumberOfCellsInBlock() && cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); - kernel->L2L(&cell, child, idxLevel); + kernel->L2L(&cell, child, idxLevel); - if(currentCells->getCellMortonIndex(idxParentCell) == lastParent){ - break; + if(currentCells->getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; } + } + } + else{ + #pragma omp task default(none) firstprivate(idxLevel, currentCells, cellLocals, subCellGroup, subCellLocalGroupsLocal) depend(inout: subCellLocalGroupsLocal[0]) depend(in: cellLocals[0]) priority_if_supported(FGroupTaskDepAlgorithm_Prio_L2L) + { + KernelClass*const kernel = kernels[omp_get_thread_num()]; - idxParentCell += 1; + const MortonIndex firstParent = FMath::Max(currentCells->getStartingIndex(), subCellGroup->getStartingIndex()>>3); + const MortonIndex lastParent = FMath::Min(currentCells->getEndingIndex()-1, (subCellGroup->getEndingIndex()-1)>>3); + FTIME_TASKS(FTaskTimer::ScopeEvent taskTime(omp_get_thread_num(), &taskTimeRecorder, ((lastParent * 20) + idxLevel) * 8 + 4, "L2L")); + + int idxParentCell = currentCells->getCellIndex(firstParent); + FAssertLF(idxParentCell != -1); + + int idxChildCell = subCellGroup->getFistChildIdx(firstParent); + FAssertLF(idxChildCell != -1); + CellClass childData[8]; + + while(true){ + CellClass cell = currentCells->getDownCell(idxParentCell); + FAssertLF(cell.getMortonIndex() == currentCells->getCellMortonIndex(idxParentCell)); + CellClass* child[8] = {nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr,nullptr}; + + FAssertLF(cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); + + do{ + const int idxChild = ((subCellGroup->getCellMortonIndex(idxChildCell)) & 7); + FAssertLF(child[idxChild] == nullptr); + childData[idxChild] = subCellGroup->getDownCell(idxChildCell); + FAssertLF(subCellGroup->getCellMortonIndex(idxChildCell) == childData[idxChild].getMortonIndex()); + child[idxChild] = &childData[idxChild]; + + idxChildCell += 1; + }while(idxChildCell != subCellGroup->getNumberOfCellsInBlock() && cell.getMortonIndex() == (subCellGroup->getCellMortonIndex(idxChildCell)>>3)); + + kernel->L2L(&cell, child, idxLevel); + + if(currentCells->getCellMortonIndex(idxParentCell) == lastParent){ + break; + } + + idxParentCell += 1; + } } } diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp index e252ffe097418c7429fba9a5d22944a539b2caab..721d720686ce6e0271ecd506659979e27d471f85 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp @@ -109,6 +109,8 @@ protected: starpu_codelet p2p_redux_read; #endif + const bool noCommuteAtLastLevel; + #ifdef STARPU_USE_CPU StarPUCpuWrapperClass cpuWrapper; #endif @@ -140,7 +142,8 @@ protected: public: FGroupTaskStarPUAlgorithm(OctreeClass*const inTree, KernelClass* inKernels) : tree(inTree), originalCpuKernel(inKernels), - cellHandles(nullptr), + cellHandles(nullptr), + noCommuteAtLastLevel(getenv("SCALFMM_NO_COMMUTE_LAST_L2L") != NULL && getenv("SCALFMM_NO_COMMUTE_LAST_L2L")[0] != '0'?true:false), #ifdef STARPU_USE_CPU cpuWrapper(tree->getHeight()), #endif @@ -212,6 +215,7 @@ public: #ifdef SCALFMM_ENABLE_CUDA_KERNEL FLOG(FLog::Controller << "FGroupTaskStarPUAlgorithm (Max CUDA " << starpu_cuda_worker_get_count() << ")\n"); #endif + FLOG(FLog::Controller << "SCALFMM_NO_COMMUTE_LAST_L2L " << noCommuteAtLastLevel << "\n"); buildTaskNames(); } @@ -1118,7 +1122,12 @@ protected: task->dyn_handles[3] = cellHandles[idxLevel+1][idxSubGroup].down; // put the right codelet - task->cl = (idxLevel == FAbstractAlgorithm::lowerWorkingLevel - 2 ? &l2l_cl_nocommute : &l2l_cl); + if(noCommuteAtLastLevel){ + task->cl = (idxLevel == FAbstractAlgorithm::lowerWorkingLevel - 2 ? &l2l_cl_nocommute : &l2l_cl); + } + else{ + task->cl = &l2l_cl; + } // put args values char *arg_buffer; size_t arg_buffer_size; @@ -1149,7 +1158,12 @@ protected: task->dyn_handles[3] = cellHandles[idxLevel+1][idxSubGroup].down; // put the right codelet - task->cl = (idxLevel == FAbstractAlgorithm::lowerWorkingLevel - 2 ? &l2l_cl_nocommute : &l2l_cl); + if(noCommuteAtLastLevel){ + task->cl = (idxLevel == FAbstractAlgorithm::lowerWorkingLevel - 2 ? &l2l_cl_nocommute : &l2l_cl); + } + else{ + task->cl = &l2l_cl; + } // put args values char *arg_buffer; size_t arg_buffer_size;