Commit 3d80b7ee authored by COULAUD Olivier

Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/scalfmm/scalfmm

# By Berenger Bramas
# Via Berenger Bramas
* 'master' of git+ssh://scm.gforge.inria.fr//gitroot/scalfmm/scalfmm:
  When using StarPU, the number of threads should be given by environment variables (and not by a command-line parameter)
  When using StarPU, the number of threads should be given by environment variables (and not by a command-line parameter)
  You can specify a ratio when filling the group tree to avoid too much empty area between cells
  debug the scheduler
  debug the scheduler
parents f58ceac8 fa0a4436
......@@ -41,8 +41,14 @@ public:
FAssertLF(inKernels, "kernels cannot be null");
kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static) num_threads(MaxThreads)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
this->kernels[idxThread] = new KernelClass(*inKernels);
// We want to ensure that each thread allocate data close to him
// and that only one thread at a time call the copy constructor
#pragma omp critical (FGroupTaskDepAlgorithm_InitKernels)
{
this->kernels[idxThread] = new KernelClass(*inKernels);
}
}
FLOG(FLog::Controller << "FGroupTaskAlgorithm (Max Thread " << MaxThreads << ")\n");
......@@ -58,7 +64,7 @@ public:
void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
FLOG( FLog::Controller << "\tStart FGroupTaskAlgorithm\n" );
#pragma omp parallel
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp single nowait
{
......@@ -67,7 +73,7 @@ public:
}
}
#pragma omp parallel
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp single nowait
{
......
......@@ -43,7 +43,7 @@ public:
FAssertLF(inKernels, "kernels cannot be null");
kernels = new KernelClass*[MaxThreads];
#pragma omp parallel for schedule(static)
#pragma omp parallel for schedule(static) num_threads(MaxThreads)
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
// We want to ensure that each thread allocate data close to him
// and that only one thread at a time call the copy constructor
......@@ -66,7 +66,7 @@ public:
void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
FLOG( FLog::Controller << "\tStart FGroupTaskDepAlgorithm\n" );
#pragma omp parallel
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp single nowait
{
......@@ -75,7 +75,7 @@ public:
}
}
#pragma omp parallel
#pragma omp parallel num_threads(MaxThreads)
{
#pragma omp single nowait
{
......
......@@ -80,7 +80,6 @@ protected:
std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;
int MaxThreads; //< The number of threads
OctreeClass*const tree; //< The Tree
KernelClass*const originalCpuKernel;
......@@ -112,8 +111,8 @@ protected:
FStarPUPtrInterface* wrapperptr;
public:
FGroupTaskStarPUAlgorithm(OctreeClass*const inTree, KernelClass* inKernels, const int inMaxThreads = -1)
: MaxThreads(inMaxThreads), tree(inTree), originalCpuKernel(inKernels),
FGroupTaskStarPUAlgorithm(OctreeClass*const inTree, KernelClass* inKernels)
: tree(inTree), originalCpuKernel(inKernels),
cellHandles(nullptr),
#ifdef STARPU_USE_CPU
cpuWrapper(tree->getHeight()),
......@@ -127,7 +126,6 @@ public:
wrapperptr(&wrappers){
FAssertLF(tree, "tree cannot be null");
FAssertLF(inKernels, "kernels cannot be null");
FAssertLF(MaxThreads <= STARPU_MAXCPUS, "number of threads to high");
struct starpu_conf conf;
FAssertLF(starpu_conf_init(&conf) == 0);
......@@ -164,8 +162,6 @@ public:
starpu_pause();
MaxThreads = starpu_worker_get_count();//starpu_cpu_worker_get_count();
cellHandles = new std::vector<CellHandles>[tree->getHeight()];
initCodelet();
......
......@@ -95,7 +95,6 @@ protected:
std::vector< std::vector< std::vector<BlockInteractions<CellContainerClass>>>> externalInteractionsAllLevel;
std::vector< std::vector<BlockInteractions<ParticleGroupClass>>> externalInteractionsLeafLevel;
int MaxThreads; //< The number of threads
OctreeClass*const tree; //< The Tree
KernelClass*const originalCpuKernel;
......@@ -129,8 +128,8 @@ protected:
FStarPUPtrInterface* wrapperptr;
public:
FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels, const int inMaxThreads = -1)
: comm(inComm), MaxThreads(inMaxThreads), tree(inTree), originalCpuKernel(inKernels),
FGroupTaskStarPUMpiAlgorithm(const FMpi::FComm& inComm, OctreeClass*const inTree, KernelClass* inKernels)
: comm(inComm), tree(inTree), originalCpuKernel(inKernels),
cellHandles(nullptr),
#ifdef STARPU_USE_CPU
cpuWrapper(tree->getHeight()),
......@@ -144,7 +143,6 @@ public:
wrapperptr(&wrappers){
FAssertLF(tree, "tree cannot be null");
FAssertLF(inKernels, "kernels cannot be null");
FAssertLF(MaxThreads <= STARPU_MAXCPUS, "number of threads to high");
struct starpu_conf conf;
FAssertLF(starpu_conf_init(&conf) == 0);
......@@ -182,8 +180,6 @@ public:
starpu_pause();
MaxThreads = starpu_worker_get_count();//starpu_cpu_worker_get_count();
cellHandles = new std::vector<CellHandles>[tree->getHeight()];
initCodelet();
......
......@@ -384,6 +384,9 @@ public:
boxCenter(inBoxCenter), boxCorner(inBoxCenter,-(inBoxWidth/2)), boxWidth(inBoxWidth),
boxWidthAtLeafLevel(inBoxWidth/FReal(1<<(inTreeHeight-1))){
FAssertLF(inCoverRatio == 0.0 || oneParent == true, "If a ratio is choosen oneParent should be turned on");
const bool userCoverRatio = (inCoverRatio != 0.0);
cellBlocksPerLevel = new std::vector<CellGroupClass*>[treeHeight];
MortonIndex* currentBlockIndexes = new MortonIndex[nbElementsPerBlock];
......@@ -430,7 +433,11 @@ public:
int sizeOfBlock = 0;
int lastParticle = firstParticle;
// Count until end of sub group is reached or we have enough cells
while(sizeOfBlock < nbElementsPerBlock && lastParticle < nbParticles){
while(sizeOfBlock < nbElementsPerBlock && lastParticle < nbParticles
&& (userCoverRatio == false
|| sizeOfBlock == 0
|| currentBlockIndexes[sizeOfBlock-1] == particlesToSort[lastParticle].mindex
|| (FReal(sizeOfBlock+1)/FReal(particlesToSort[lastParticle].mindex-particlesToSort[firstParticle].mindex)) >= inCoverRatio)){
if(sizeOfBlock == 0 || currentBlockIndexes[sizeOfBlock-1] != particlesToSort[lastParticle].mindex){
currentBlockIndexes[sizeOfBlock] = particlesToSort[lastParticle].mindex;
nbParticlesPerLeaf[sizeOfBlock] = 1;
......@@ -581,7 +588,7 @@ public:
}
}
// If we are at the end of the sub group, move to next (otherwise we have consume a part of it)
if((*iterChildCells)->getEndingIndex() <= currentCellIndex){
while(iterChildCells != iterChildEndCells && (*iterChildCells)->getEndingIndex() <= currentCellIndex){
++iterChildCells;
// Update morton index
if(iterChildCells != iterChildEndCells && currentCellIndex < (*iterChildCells)->getStartingIndex()){
......@@ -612,9 +619,6 @@ public:
sizeOfBlock = 0;
}
else{
assert(iterChildCells == iterChildEndCells);
}
}
}
}
......
......@@ -192,6 +192,7 @@ static void initialize_heteroprio_center_policy(unsigned sched_ctx_id)
#endif
/* Alloc the scheduler data */
struct _starpu_heteroprio_center_policy_heteroprio* heteroprio = (struct _starpu_heteroprio_center_policy_heteroprio*)malloc(sizeof(struct _starpu_heteroprio_center_policy_heteroprio));
memset(heteroprio, 0, sizeof(*heteroprio));
heteroprio->waiters = starpu_bitmap_create();
starpu_sched_ctx_set_policy_data(sched_ctx_id, (void*)heteroprio);
STARPU_PTHREAD_MUTEX_INIT(&heteroprio->policy_mutex, NULL);
......@@ -407,6 +408,10 @@ static struct starpu_task *pop_task_heteroprio_policy(unsigned sched_ctx_id)
if( heteroprio->nb_remaining_tasks_per_arch_index[worker->arch_index] != 0 ){
/* Ideally we would like to fill the prefetch array */
unsigned nb_tasks_to_prefetch = (HETEROPRIO_MAX_PREFETCH-worker->tasks_queue_size);
/* But there are maybe less tasks than that! */
if(nb_tasks_to_prefetch > heteroprio->nb_remaining_tasks_per_arch_index[worker->arch_index]){
nb_tasks_to_prefetch = heteroprio->nb_remaining_tasks_per_arch_index[worker->arch_index];
}
/* But in case there are less tasks than worker we take the minimum */
if(heteroprio->nb_remaining_tasks_per_arch_index[worker->arch_index] < heteroprio->nb_workers){
if(worker->tasks_queue_size == 0) nb_tasks_to_prefetch = 1;
......
......@@ -43,13 +43,13 @@
#include "../../Src/Core/FFmmAlgorithm.hpp"
int main(int argc, char* argv[]){
setenv("STARPU_NCPU","1",1);
const FParameterNames LocalOptionBlocSize {
{"-bs"},
"The size of the block of the blocked tree"
};
FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.",
FParameterDefinitions::OctreeHeight, FParameterDefinitions::NbThreads,
FParameterDefinitions::NbParticles, LocalOptionBlocSize);
FParameterDefinitions::OctreeHeight, FParameterDefinitions::NbParticles, LocalOptionBlocSize);
typedef double FReal;
......@@ -69,8 +69,6 @@ int main(int argc, char* argv[]){
typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
#elif defined(SCALFMM_USE_OMP4)
typedef FTestKernels< GroupCellClass, GroupContainerClass > GroupKernelClass;
// Set the number of threads
omp_set_num_threads(FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, omp_get_max_threads()));
typedef FGroupTaskDepAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupCellClass,
GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass > GroupAlgorithm;
#else
......@@ -121,8 +119,9 @@ int main(int argc, char* argv[]){
// Put the data into the tree
//GroupOctreeClass groupedTree(NbLevels, groupSize, &tree);
GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles);
//GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles);
//GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles, false, true);
GroupOctreeClass groupedTree(NbLevels, loader.getBoxWidth(), loader.getCenterOfBox(), groupSize, &allParticles, false, true, 0.2);
groupedTree.printInfoBlocks();
// Check tree structure at leaf level
......
......@@ -54,7 +54,6 @@ int main(int argc, char* argv[]){
#else
FParameterDefinitions::InputFile,
#endif
FParameterDefinitions::NbThreads,
LocalOptionBlocSize, LocalOptionNoValidate);
// Initialize the types
......@@ -76,8 +75,6 @@ int main(int argc, char* argv[]){
typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
#elif defined(SCALFMM_USE_OMP4)
typedef FChebSymKernel<FReal,GroupCellClass,GroupContainerClass,MatrixKernelClass,ORDER> GroupKernelClass;
// Set the number of threads
omp_set_num_threads(FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, omp_get_max_threads()));
typedef FGroupTaskDepAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupCellClass,
GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass > GroupAlgorithm;
#else
......
......@@ -52,8 +52,8 @@ int main(int argc, char* argv[]){
"The size of the block of the blocked tree"
};
FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.",
FParameterDefinitions::OctreeHeight, FParameterDefinitions::NbThreads,
FParameterDefinitions::NbParticles, FParameterDefinitions::NbThreads,
FParameterDefinitions::OctreeHeight,
FParameterDefinitions::NbParticles,
LocalOptionBlocSize);
typedef double FReal;
// Initialize the types
......@@ -72,7 +72,6 @@ int main(int argc, char* argv[]){
FMpi mpiComm(argc, argv);
// Get params
const int maxThreads = FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, -1);
const int NbLevels = FParameters::getValue(argc,argv,FParameterDefinitions::OctreeHeight.options, 5);
const int NbParticles = FParameters::getValue(argc,argv,FParameterDefinitions::NbParticles.options, 20);
const int groupSize = FParameters::getValue(argc,argv,LocalOptionBlocSize.options, 250);
......@@ -141,7 +140,7 @@ int main(int argc, char* argv[]){
// Run the algorithm
GroupKernelClass groupkernel;
GroupAlgorithm groupalgo(mpiComm.global(), &groupedTree,&groupkernel,maxThreads);
GroupAlgorithm groupalgo(mpiComm.global(), &groupedTree,&groupkernel);
groupalgo.execute();
groupedTree.forEachCellLeaf<GroupContainerClass>([&](GroupCellClass cell, GroupContainerClass* leaf){
......
......@@ -56,7 +56,7 @@ int main(int argc, char* argv[]){
const FParameterNames LocalOptionNoValidate { {"-no-validation"}, "To avoid comparing with direct computation"};
FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.",
FParameterDefinitions::OctreeHeight,FParameterDefinitions::InputFile,
FParameterDefinitions::OctreeSubHeight, FParameterDefinitions::NbThreads,
FParameterDefinitions::OctreeSubHeight,
LocalOptionBlocSize, LocalOptionNoValidate);
typedef double FReal;
......@@ -86,7 +86,6 @@ int main(int argc, char* argv[]){
const char* const filename = FParameters::getStr(argc,argv,FParameterDefinitions::InputFile.options, "../Data/test20k.fma");
const unsigned int TreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeHeight.options, 5);
const unsigned int SubTreeHeight = FParameters::getValue(argc, argv, FParameterDefinitions::OctreeSubHeight.options, 2);
const unsigned int NbThreads = FParameters::getValue(argc, argv, FParameterDefinitions::NbThreads.options, 1);
// init particles position and physical value
struct TestParticle{
......@@ -164,7 +163,7 @@ int main(int argc, char* argv[]){
const MatrixKernelClass MatrixKernel;
GroupKernelClass groupkernel(TreeHeight, loader.getBoxWidth(), loader.getCenterOfBox(), &MatrixKernel);
// Run the algorithm
GroupAlgorithm groupalgo(mpiComm.global(), &groupedTree,&groupkernel,NbThreads);
GroupAlgorithm groupalgo(mpiComm.global(), &groupedTree,&groupkernel);
groupalgo.execute();
timer.tac();
......
......@@ -43,7 +43,6 @@ int main(int argc, char* argv[]){
const FParameterNames LocalOptionNoValidate { {"-no-validation"}, "To avoid comparing with direct computation"};
FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.",
FParameterDefinitions::OctreeHeight,FParameterDefinitions::InputFile,
FParameterDefinitions::NbThreads,
FParameterDefinitions::NbParticles, LocalOptionBlocSize, LocalOptionNoValidate);
// Initialize the types
......@@ -62,8 +61,6 @@ int main(int argc, char* argv[]){
typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
#elif defined(SCALFMM_USE_OMP4)
typedef FRotationKernel< FReal, GroupCellClass, GroupContainerClass , P> GroupKernelClass;
// Set the number of threads
omp_set_num_threads(FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, omp_get_max_threads()));
typedef FGroupTaskDepAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupCellClass,
GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass > GroupAlgorithm;
#else
......
......@@ -43,7 +43,6 @@ int main(int argc, char* argv[]){
const FParameterNames LocalOptionNoValidate { {"-no-validation"}, "To avoid comparing with direct computation"};
FHelpDescribeAndExit(argc, argv, "Test the blocked tree by counting the particles.",
FParameterDefinitions::OctreeHeight,FParameterDefinitions::InputFile,
FParameterDefinitions::NbThreads,
FParameterDefinitions::NbParticles, LocalOptionBlocSize, LocalOptionNoValidate);
// Initialize the types
......@@ -62,8 +61,6 @@ int main(int argc, char* argv[]){
typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
#elif defined(SCALFMM_USE_OMP4)
typedef FTaylorKernel< FReal,GroupCellClass, GroupContainerClass , P,1> GroupKernelClass;
// Set the number of threads
omp_set_num_threads(FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, omp_get_max_threads()));
typedef FGroupTaskDepAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupCellClass,
GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass > GroupAlgorithm;
#else
......
......@@ -111,8 +111,6 @@ int main(int argc, char* argv[]){
typedef FGroupTaskStarPUAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupCpuWrapper > GroupAlgorithm;
#elif defined(SCALFMM_USE_OMP4)
typedef FRotationKernel< FReal, GroupCellClass, GroupContainerClass , P> GroupKernelClass;
// Set the number of threads
omp_set_num_threads(FParameters::getValue(argc,argv,FParameterDefinitions::NbThreads.options, omp_get_max_threads()));
typedef FGroupTaskDepAlgorithm<GroupOctreeClass, typename GroupOctreeClass::CellGroupClass, GroupCellClass,
GroupCellSymbClass, GroupCellUpClass, GroupCellDownClass, GroupKernelClass, typename GroupOctreeClass::ParticleGroupClass, GroupContainerClass > GroupAlgorithm;
#else
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment