diff --git a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp index e00292dc8cce82c9614330268a73a11065f9a074..ff31b802f0cec54577fe51839a1f853cf72f3c91 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuAlgorithm.hpp @@ -237,6 +237,7 @@ public: void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){ FLOG( FLog::Controller << "\tStart FGroupTaskStarPUAlgorithm\n" ); + const bool directOnly = (tree->getHeight() <= 2); #pragma omp parallel #pragma omp single @@ -246,17 +247,17 @@ public: starpu_resume(); - if(operationsToProceed & FFmmP2M) bottomPass(); + if(operationsToProceed & FFmmP2M && !directOnly) bottomPass(); - if(operationsToProceed & FFmmM2M) upwardPass(); + if(operationsToProceed & FFmmM2M && !directOnly) upwardPass(); - if(operationsToProceed & FFmmM2L) transferPass(); + if(operationsToProceed & FFmmM2L && !directOnly) transferPass(); - if(operationsToProceed & FFmmL2L) downardPass(); + if(operationsToProceed & FFmmL2L && !directOnly) downardPass(); if( operationsToProceed & FFmmP2P ) directPass(); - if( operationsToProceed & FFmmL2P ) mergePass(); + if( operationsToProceed & FFmmL2P && !directOnly) mergePass(); starpu_task_wait_for_all(); starpu_pause(); diff --git a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp index 677e4960da7e4613ab52af856a245a5612eefc4b..a2d4a1ed0f49ff5666c875f451881f697f398880 100644 --- a/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp +++ b/Src/GroupTree/Core/FGroupTaskStarpuMpiAlgorithm.hpp @@ -256,6 +256,7 @@ public: void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){ FLOG( FLog::Controller << "\tStart FGroupTaskStarPUMpiAlgorithm\n" ); + const bool directOnly = (tree->getHeight() <= 2); #pragma omp parallel #pragma omp single @@ -271,20 +272,20 @@ public: if( operationsToProceed & FFmmP2P ) insertParticlesSend(); - if(operationsToProceed & FFmmP2M) bottomPass(); + if(operationsToProceed & FFmmP2M && !directOnly) bottomPass(); - if(operationsToProceed & FFmmM2M) upwardPass(); - if(operationsToProceed & FFmmM2L) insertCellsSend(); + if(operationsToProceed & FFmmM2M && !directOnly) upwardPass(); + if(operationsToProceed & FFmmM2L && !directOnly) insertCellsSend(); - if(operationsToProceed & FFmmM2L) transferPass(); - if(operationsToProceed & FFmmM2L) transferPassMpi(); + if(operationsToProceed & FFmmM2L && !directOnly) transferPass(); + if(operationsToProceed & FFmmM2L && !directOnly) transferPassMpi(); - if(operationsToProceed & FFmmL2L) downardPass(); + if(operationsToProceed & FFmmL2L && !directOnly) downardPass(); if( operationsToProceed & FFmmP2P ) directPass(); if( operationsToProceed & FFmmP2P ) directPassMpi(); - if( operationsToProceed & FFmmL2P ) mergePass(); + if( operationsToProceed & FFmmL2P && !directOnly) mergePass(); starpu_task_wait_for_all(); starpu_pause(); @@ -918,26 +919,29 @@ protected: void postRecvAllocatedBlocks(){ std::vector<MpiDependency> toRecv; FAssertLF(tree->getHeight() == int(remoteCellGroups.size())); - for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ - for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){ - if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){ - FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp); - FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex << - " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n"); - FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex << - " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n"); - - starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb, - processesBlockInfos[idxLevel][idxHandle].owner, - getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0), - comm.getComm(), 0, 0 ); - starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp, - processesBlockInfos[idxLevel][idxHandle].owner, - getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1), - comm.getComm(), 0, 0 ); - - toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner, - comm.processId(), idxLevel, idxHandle}); + const bool directOnly = (tree->getHeight() <= 2); + if(!directOnly){ + for(int idxLevel = 0 ; idxLevel < tree->getHeight() ; ++idxLevel){ + for(int idxHandle = 0 ; idxHandle < int(remoteCellGroups[idxLevel].size()) ; ++idxHandle){ + if(remoteCellGroups[idxLevel][idxHandle].ptrSymb){ + FAssertLF(remoteCellGroups[idxLevel][idxHandle].ptrUp); + FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex << + " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0) << "\n"); + FLOG(FLog::Controller << "[SMpi] " << idxLevel << " Post a recv during M2L for Idx " << processesBlockInfos[idxLevel][idxHandle].firstIndex << + " and dest is " << processesBlockInfos[idxLevel][idxHandle].owner << " tag " << getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1) << "\n"); + + starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleSymb, + processesBlockInfos[idxLevel][idxHandle].owner, + getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 0), + comm.getComm(), 0, 0 ); + starpu_mpi_irecv_detached( remoteCellGroups[idxLevel][idxHandle].handleUp, + processesBlockInfos[idxLevel][idxHandle].owner, + getTag(idxLevel,processesBlockInfos[idxLevel][idxHandle].firstIndex, 1), + comm.getComm(), 0, 0 ); + + toRecv.push_back({processesBlockInfos[idxLevel][idxHandle].owner, + comm.processId(), idxLevel, idxHandle}); + } } } } diff --git a/Src/GroupTree/StarPUUtils/FStarPUFmmPriorities.hpp b/Src/GroupTree/StarPUUtils/FStarPUFmmPriorities.hpp index 9cbffe33ee720ee813a0a4c3e06ad0ca061ca60c..e13eaf22a77d7a58e75bf0ad05fc170201064d4d 100644 --- a/Src/GroupTree/StarPUUtils/FStarPUFmmPriorities.hpp +++ b/Src/GroupTree/StarPUUtils/FStarPUFmmPriorities.hpp @@ -59,40 +59,65 @@ public: treeHeight = inTreeHeight; - int incPrio = 0; + if(inTreeHeight > 2){ + int incPrio = 0; - prioP2MSend = incPrio++; - prioP2M = incPrio++; + prioP2MSend = incPrio++; + prioP2M = incPrio++; - prioM2MSend = incPrio++; - prioM2M = incPrio++; + prioM2MSend = incPrio++; + prioM2M = incPrio++; - prioM2L = incPrio; - prioM2LExtern = incPrio; - prioM2LMpi = incPrio++; + prioM2L = incPrio; + prioM2LExtern = incPrio; + prioM2LMpi = incPrio++; - prioL2L = incPrio++; + prioL2L = incPrio++; - incPrio += (treeHeight-2)-1 // M2L is done treeHeight-2 times - +(treeHeight-3)-1; // L2L is done treeHeight-3 times + incPrio += (treeHeight-2)-1 // M2L is done treeHeight-2 times + +(treeHeight-3)-1; // L2L is done treeHeight-3 times - prioP2P = incPrio; - prioP2PExtern = incPrio; - prioP2PMpi = incPrio++; + prioP2P = incPrio; + prioP2PExtern = incPrio; + prioP2PMpi = incPrio++; - prioL2P = incPrio++; - assert(incPrio == 6 + (treeHeight-2) + (treeHeight-3)); + prioL2P = incPrio++; + assert(incPrio == 6 + (treeHeight-2) + (treeHeight-3)); + } + else{ + int incPrio = 0; + + prioP2MSend = -1; + prioP2M = -1; + + prioM2MSend = -1; + prioM2M = -1; + + prioM2L = -1; + prioM2LExtern = -1; + prioM2LMpi = -1; + + prioL2L = -1; + + prioP2P = incPrio; + prioP2PExtern = incPrio; + prioP2PMpi = incPrio++; + + prioL2P = -1; + assert(incPrio == 1); + } } void initSchedulerCallback(unsigned /*sched_ctx_id*/, struct _starpu_heteroprio_center_policy_heteroprio *heteroprio){ + const bool workOnlyOnLeaves = (treeHeight <= 2); #ifdef STARPU_USE_CPU // CPU follows the real prio { int cpuCountPrio = 0; //prioP2MSend = 0; //prioP2M = prioP2MSend+1; - if(capacities->supportP2M(FSTARPU_CPU_IDX)){ + if( !workOnlyOnLeaves && capacities->supportP2M(FSTARPU_CPU_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioP2MSend; heteroprio->buckets[prioP2MSend].valide_archs |= STARPU_CPU; @@ -101,8 +126,8 @@ public: } //prioM2MSend = prioP2M+1; //prioM2M = prioM2MSend+1; - assert(cpuCountPrio == prioM2MSend); // True if CPU support all TODO - if(capacities->supportM2M(FSTARPU_CPU_IDX)){ + //assert(cpuCountPrio == prioM2MSend); // True if CPU support all TODO + if(!workOnlyOnLeaves && capacities->supportM2M(FSTARPU_CPU_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioM2MSend; heteroprio->buckets[prioM2MSend].valide_archs |= STARPU_CPU; @@ -114,7 +139,7 @@ public: // prioM2LExtern = prioM2L; // prioM2LMpi = prioM2L; // prioL2L = prioM2L+1; - assert(cpuCountPrio == prioM2L); // True if CPU support all TODO + // assert(cpuCountPrio == prioM2L); // True if CPU support all TODO for(int idxLevel = 2 ; idxLevel < treeHeight ; ++idxLevel){ if(capacities->supportM2L(FSTARPU_CPU_IDX)){ const int prioM2LAtLevel = getPrioM2L(idxLevel); @@ -127,19 +152,19 @@ public: heteroprio->buckets[prioL2LAtLevel].valide_archs |= STARPU_CPU; } } - assert(cpuCountPrio == prioP2P); // True if CPU support all TODO + // assert(cpuCountPrio == prioP2P); // True if CPU support all TODO //prioP2P = prioL2L + (treeHeight-3)*2+1 +1; //prioP2PExtern = prioP2P; //prioP2PMpi = prioP2P; - if(capacities->supportP2P(FSTARPU_CPU_IDX)){ + if( capacities->supportP2P(FSTARPU_CPU_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioP2P; heteroprio->buckets[prioP2P].valide_archs |= STARPU_CPU; } - assert(cpuCountPrio == prioL2P); // True if CPU support all TODO + //assert(cpuCountPrio == prioL2P); // True if CPU support all TODO //prioL2P = prioP2PMpi+1; - if(capacities->supportL2P(FSTARPU_CPU_IDX)){ + if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CPU_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CPU_IDX][cpuCountPrio++] = prioL2P; heteroprio->buckets[prioL2P].valide_archs |= STARPU_CPU; } @@ -180,7 +205,7 @@ public: //prioP2MSend = 0; //prioP2M = prioP2MSend+1; - if(capacities->supportP2M(FSTARPU_OPENCL_IDX)){ + if( !workOnlyOnLeaves && capacities->supportP2M(FSTARPU_OPENCL_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_OPENCL_IDX][openclCountPrio++] = prioP2MSend; heteroprio->buckets[prioP2MSend].valide_archs |= STARPU_OPENCL; @@ -190,7 +215,7 @@ public: //prioM2MSend = prioP2M+1; //prioM2M = prioM2MSend+1; - if(capacities->supportM2M(FSTARPU_OPENCL_IDX)){ + if( !workOnlyOnLeaves && capacities->supportM2M(FSTARPU_OPENCL_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_OPENCL_IDX][openclCountPrio++] = prioM2MSend; heteroprio->buckets[prioM2MSend].valide_archs |= STARPU_OPENCL; @@ -208,7 +233,7 @@ public: } //prioL2P = prioP2PMpi+1; - if(capacities->supportL2P(FSTARPU_OPENCL_IDX)){ + if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_OPENCL_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_OPENCL_IDX][openclCountPrio++] = prioL2P; heteroprio->buckets[prioL2P].valide_archs |= STARPU_OPENCL; } @@ -249,7 +274,7 @@ public: //prioP2MSend = 0; //prioP2M = prioP2MSend+1; - if(capacities->supportP2M(FSTARPU_CUDA_IDX)){ + if( !workOnlyOnLeaves && capacities->supportP2M(FSTARPU_CUDA_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][openclCountPrio++] = prioP2MSend; heteroprio->buckets[prioP2MSend].valide_archs |= STARPU_CUDA; @@ -259,7 +284,7 @@ public: //prioM2MSend = prioP2M+1; //prioM2M = prioM2MSend+1; - if(capacities->supportM2M(FSTARPU_CUDA_IDX)){ + if( !workOnlyOnLeaves && capacities->supportM2M(FSTARPU_CUDA_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][openclCountPrio++] = prioM2MSend; heteroprio->buckets[prioM2MSend].valide_archs |= STARPU_CUDA; @@ -277,7 +302,7 @@ public: } //prioL2P = prioP2PMpi+1; - if(capacities->supportL2P(FSTARPU_CUDA_IDX)){ + if( !workOnlyOnLeaves && capacities->supportL2P(FSTARPU_CUDA_IDX)){ heteroprio->prio_mapping_per_arch_index[FSTARPU_CUDA_IDX][openclCountPrio++] = prioL2P; heteroprio->buckets[prioL2P].valide_archs |= STARPU_CUDA; } diff --git a/Tests/noDist/testBlockedAlgorithm.cpp b/Tests/noDist/testBlockedAlgorithm.cpp index bc41c0e8286c009564ed16f78f078197f1d96108..2d542a7fe94b647a71e8885695fe9bcb1a39de5c 100644 --- a/Tests/noDist/testBlockedAlgorithm.cpp +++ b/Tests/noDist/testBlockedAlgorithm.cpp @@ -48,7 +48,8 @@ int main(int argc, char* argv[]){ "The size of the block of the blocked tree" }; FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.", - FParameterDefinitions::OctreeHeight, FParameterDefinitions::NbParticles, LocalOptionBlocSize); + FParameterDefinitions::OctreeHeight, FParameterDefinitions::NbParticles, + FParameterDefinitions::OctreeSubHeight, LocalOptionBlocSize); typedef double FReal; @@ -101,7 +102,8 @@ int main(int argc, char* argv[]){ FAssertLF(loader.isOpen()); // Usual octree - OctreeClass tree(NbLevels, 2, loader.getBoxWidth(), loader.getCenterOfBox()); + OctreeClass tree(NbLevels, FParameters::getValue(argc,argv,FParameterDefinitions::OctreeSubHeight.options, 2), + loader.getBoxWidth(), loader.getCenterOfBox()); FTestParticleContainer<FReal> allParticles; for(FSize idxPart = 0 ; idxPart < loader.getNumberOfParticles() ; ++idxPart){