Commit 0763df33 authored by COULAUD Olivier

Merge branch 'master' of git+ssh://scm.gforge.inria.fr//gitroot/scalfmm/scalfmm

Conflicts:
	Src/Core/FFmmAlgorithmThreadProc.hpp
parents 9d74335f 6088cd2f
// ===================================================================================
// Copyright ScalFmm 2011 INRIA, Olivier Coulaud, Bérenger Bramas, Matthias Messner
// olivier.coulaud@inria.fr, berenger.bramas@inria.fr
// This software is a computer program whose purpose is to compute the FMM.
//
@@ -36,7 +36,7 @@
#include "../Containers/FMpiBufferReader.hpp"
#include "../Utils/FMpi.hpp"
#include <sys/time.h>
#include "FCoreCommon.hpp"
@@ -62,1296 +62,2373 @@
*/
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
// Can be deleted
// const static int MaxSizePerCell = CellClass::GetSize();
OctreeClass* const tree; //< The octree to work on
KernelClass** kernels; //< The kernels
const FMpi::FComm& comm; //< MPI comm
typename OctreeClass::Iterator* iterArray; //Will be used to store pointers to cells/leafs to work with
typename OctreeClass::Iterator* iterArrayComm; //Will be used to store pointers to cells/leafs to send/rcv
int numberOfLeafs; //< To store the size at the previous level
const int MaxThreads; //< the max number of threads allowed by OpenMP
const int nbProcess; //< Number of process
const int idProcess; //< Id of current process
const int OctreeHeight; //<Height of the tree
/** An interval is the morton index interval
* that a proc uses (it holds data in this interval)
*/
struct Interval{
MortonIndex min;
MortonIndex max;
};
/** My interval */
Interval*const intervals;
/** All process intervals */
Interval*const workingIntervalsPerLevel;
/** Get an interval from proc id and level */
Interval& getWorkingInterval( int level, int proc){
return workingIntervalsPerLevel[OctreeHeight * proc + level];
}
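// Minimal illustration of the layout above (values assumed, not part of the class):
// with OctreeHeight == 4 and nbProcess == 2 the gathered buffer is ordered
// [proc0: level0..3][proc1: level0..3], so getWorkingInterval(2, 1) reads slot 4*1 + 2 == 6.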
public:
/** Get current proc interval at level */
Interval& getWorkingInterval( int level){
return getWorkingInterval(level, idProcess);
}
/** Does the current proc have some work at this level */
bool hasWorkAtLevel( int level){
return idProcess == 0 || (getWorkingInterval(level, idProcess - 1).max) < (getWorkingInterval(level, idProcess).max);
}
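// Illustration (assumed values): if proc 0 and proc 1 both end at morton index 12 at some
// level, hasWorkAtLevel returns false for proc 1 (its max does not exceed its predecessor's),
// while proc 0 always reports work.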
/** The constructor needs the octree and the kernels used for computation
* @param inTree the octree to work on
* @param inKernels the kernels to call
* An assert is raised if one of the arguments is null
*/
FFmmAlgorithmThreadProc(const FMpi::FComm& inComm, OctreeClass* const inTree, KernelClass* const inKernels)
: tree(inTree) , kernels(0), comm(inComm), iterArray(nullptr),iterArrayComm(nullptr),numberOfLeafs(0),
MaxThreads(omp_get_max_threads()), nbProcess(inComm.processCount()), idProcess(inComm.processId()),
OctreeHeight(tree->getHeight()),intervals(new Interval[inComm.processCount()]),
workingIntervalsPerLevel(new Interval[inComm.processCount() * tree->getHeight()])
{
FAssertLF(tree, "tree cannot be null");
this->kernels = new KernelClass*[MaxThreads];
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
this->kernels[idxThread] = new KernelClass(*inKernels);
}
FLOG(FLog::Controller << "FFmmAlgorithmThreadProc\n");
FLOG(FLog::Controller << "Max threads = " << MaxThreads << ", Procs = " << nbProcess << ", I am " << idProcess << ".\n");
}
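/* Usage sketch (illustrative only; the octree/kernel constructor arguments and the FMpi
* helper are assumed from typical ScalFmm drivers, not taken from this file):
*   FMpi app(argc, argv);
*   OctreeClass tree(treeHeight, subHeight, boxWidth, boxCenter);
*   KernelClass kernel(... kernel specific parameters ...);
*   FFmmAlgorithmThreadProc<OctreeClass, CellClass, ContainerClass, KernelClass, LeafClass>
*       algo(app.global(), &tree, &kernel);
*   algo.execute(); // one KernelClass clone per OpenMP thread is built internally
*/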
/** Default destructor */
virtual ~FFmmAlgorithmThreadProc(){
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
printf("Delete %d\n",idxThread);
delete this->kernels[idxThread];
}
delete [] this->kernels;
delete [] intervals;
delete [] workingIntervalsPerLevel;
}
/**
* To execute the fmm algorithm
* Call this function to run the complete algorithm
*/
void execute(const unsigned operationsToProceed = FFmmNearAndFarFields){
FTRACE( FTrace::FFunction functionTrace( __FUNCTION__, "Fmm" , __FILE__ , __LINE__ ) );
// Count leaf
this->numberOfLeafs = 0;
{
FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
Interval myLastInterval;
{
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
myLastInterval.min = octreeIterator.getCurrentGlobalIndex();
do{
++this->numberOfLeafs;
} while(octreeIterator.moveRight());
myLastInterval.max = octreeIterator.getCurrentGlobalIndex();
}
iterArray = new typename OctreeClass::Iterator[numberOfLeafs];
iterArrayComm = new typename OctreeClass::Iterator[numberOfLeafs];
FAssertLF(iterArray, "iterArray bad alloc");
FAssertLF(iterArrayComm, "iterArrayComm bad alloc");
// We get the min/max indexes from each proc
FMpi::MpiAssert( MPI_Allgather( &myLastInterval, sizeof(Interval), MPI_BYTE, intervals, sizeof(Interval), MPI_BYTE, comm.getComm()), __LINE__ );
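// Note (illustrative): each proc contributes one Interval as raw bytes, so after this call
// intervals[p] holds proc p's leaf-level [min, max]; with 3 procs the array has exactly
// 3 entries ordered by rank.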
Interval*const myIntervals = new Interval[OctreeHeight];
myIntervals[OctreeHeight - 1] = myLastInterval;
for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 0 ; --idxLevel){
myIntervals[idxLevel].min = myIntervals[idxLevel+1].min >> 3;
myIntervals[idxLevel].max = myIntervals[idxLevel+1].max >> 3;
}
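// Illustrative note (standard octree morton encoding assumed): a parent index is a child
// index with its 3 low bits dropped, e.g. 0b101011 (43) >> 3 == 0b101 (5), which is why
// each level's bounds are the shifted bounds of the level below.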
if(idProcess != 0){
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
octreeIterator.moveUp();
MortonIndex currentLimit = intervals[idProcess-1].max >> 3;
for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){
while(octreeIterator.getCurrentGlobalIndex() <= currentLimit){
if( !octreeIterator.moveRight() ) break;
}
myIntervals[idxLevel].min = octreeIterator.getCurrentGlobalIndex();
octreeIterator.moveUp();
currentLimit >>= 3;
}
}
printf("Proc::%d From leaf %lld to leaf %lld\n",idProcess,myLastInterval.min,myLastInterval.max);
// We get the per-level min/max indexes from each proc
FMpi::MpiAssert( MPI_Allgather( myIntervals, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE,
workingIntervalsPerLevel, int(sizeof(Interval)) * OctreeHeight, MPI_BYTE, comm.getComm()), __LINE__ );
delete[] myIntervals;
}
// run;
if(operationsToProceed & FFmmP2M) bottomPass();
if(operationsToProceed & FFmmM2M) upwardPass();
if(operationsToProceed & FFmmM2L) transferPassOld();
if(operationsToProceed & FFmmL2L) downardPass();
if((operationsToProceed & FFmmP2P) || (operationsToProceed & FFmmL2P)) directPassOld();
// delete array
delete [] iterArray;
delete [] iterArrayComm;
iterArray = 0;
iterArrayComm = 0;
}
private:
/////////////////////////////////////////////////////////////////////////////
// P2M
/////////////////////////////////////////////////////////////////////////////
/** P2M Bottom Pass */
void bottomPass(){
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Bottom Pass\n").write(FLog::Flush) );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
typename OctreeClass::Iterator octreeIterator(tree);
// Iterate on leafs
octreeIterator.gotoBottomLeft();
int leafs = 0;
do{
iterArray[leafs++] = octreeIterator;
} while(octreeIterator.moveRight());
FLOG(computationCounter.tic());
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
#pragma omp for nowait
for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
}
}
FLOG(computationCounter.tac());
FLOG( FLog::Controller << "\tFinished (@Bottom Pass (P2M) = " << counterTime.tacAndElapsed() << " s)\n" );
FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.elapsed() << " s\n" );
}
/////////////////////////////////////////////////////////////////////////////
// Upward
/////////////////////////////////////////////////////////////////////////////
/** M2M */
void upwardPassOld(){
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
FLOG(FTic prepareCounter);
FLOG(FTic waitCounter);
// Start from leaf level - 1
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
octreeIterator.moveUp();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// This variable is the proc responsible
// for the shared cells
int sendToProc = idProcess;
// There are a maximum of 8-1 sends and 8-1 receptions
MPI_Request requests[14];
MPI_Status status[14];
// Maximum data per message is:
FMpiBufferWriter sendBuffer(comm.getComm(),7*MaxSizePerCell);
const int recvBufferOffset = (8 * MaxSizePerCell + 1);
FMpiBufferReader recvBuffer(comm.getComm(), nbProcess*recvBufferOffset);
CellClass recvBufferCells[8];
int firstProcThatSend = idProcess + 1;
// for each levels
for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
// No more work for me
if(idProcess != 0
&& getWorkingInterval((idxLevel+1), idProcess).max <= getWorkingInterval((idxLevel+1), idProcess - 1).max){
break;
}
// copy cells to work with
int numberOfCells = 0;
// for each cells
do{
iterArray[numberOfCells++] = octreeIterator;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveUp();
octreeIterator = avoidGotoLeftIterator;
// We may need to send something
int iterRequests = 0;
int cellsToSend = -1;
while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).min){
++cellsToSend;
}
FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
FLOG(prepareCounter.tic());
if(idProcess != 0
&& (getWorkingInterval((idxLevel+1), idProcess).min >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).max >>3)){
char state = 0;
sendBuffer.write(state);
const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild();
for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).min <= child[idxChild]->getMortonIndex() ){
child[idxChild]->serializeUp(sendBuffer);
state = char(state | (0x1 << idxChild));
}
}
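// Example of the state byte (illustrative, assumed ownership): if only children 1 and 4
// are serialized here, state == 0b00010010, so the receiver knows which of the 8 child
// slots follow in the buffer.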
sendBuffer.writeAt(0,state);
while( sendToProc && iterArray[cellsToSend].getCurrentGlobalIndex() <= getWorkingInterval(idxLevel , sendToProc - 1).max){
--sendToProc;
}
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, sendToProc,
FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]);
}
// We may need to receive something
bool hasToReceive = false;
int endProcThatSend = firstProcThatSend;
if(idProcess != nbProcess - 1){ // if I'm the last one (idProcess == nbProcess-1), I shall not receive anything in a M2M
while(firstProcThatSend < nbProcess
&& (getWorkingInterval((idxLevel+1), firstProcThatSend).max) <= (getWorkingInterval((idxLevel+1), idProcess).max)){
// Second condition :: while firstProcThatSend's max morton index is <= my own max interval
++firstProcThatSend;
}
if(firstProcThatSend < nbProcess &&
(getWorkingInterval((idxLevel+1), firstProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3) ){
endProcThatSend = firstProcThatSend;
while( endProcThatSend < nbProcess &&
(getWorkingInterval((idxLevel+1) ,endProcThatSend).min >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).max>>3)){
++endProcThatSend;
}
if(firstProcThatSend != endProcThatSend){
hasToReceive = true;
for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){
MPI_Irecv(&recvBuffer.data()[idxProc * recvBufferOffset], recvBufferOffset, MPI_PACKED,
idxProc, FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]);
}
}
}
}
FLOG(prepareCounter.tac());
FTRACE( regionTrace.end() );
// Compute
const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells);
FLOG(computationCounter.tic());
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for nowait
for( int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
// for(int k=0 ; k< 8 ; ++k){
// if(iterArray[idxCell].getCurrentChild()[k]){
// FILE * fd = fopen("ResM2MNearOld","a+");
// fprintf(fd,"%lld\t% lld\t %d\n",iterArray[idxCell].getCurrentCell()->getMortonIndex(),iterArray[idxCell].getCurrentChild()[k]->getMortonIndex(),idxLevel);
// fclose(fd);
// }
//}
}
}
FLOG(computationCounter.tac());
// Are we sending or waiting anything?
if(iterRequests){
FLOG(waitCounter.tic());
MPI_Waitall( iterRequests, requests, status);
FLOG(waitCounter.tac());
/////////////////////////////////////////////////////////////////////////////
// Upward
/////////////////////////////////////////////////////////////////////////////
/** M2M */
void upwardPass(){
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
FLOG(FTic singleCounter);
FLOG(FTic parallelCounter);
// Start from leaf level - 1
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
octreeIterator.moveUp();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// This variable is the proc responsible
// for the shared cells
int sendToProc = idProcess;
// There are a maximum of 8-1 sends and 8-1 receptions
MPI_Request requests[14];
MPI_Status status[14];
// Maximum data per message is:
FMpiBufferWriter sendBuffer(comm.getComm(),7*MaxSizePerCell);
const int recvBufferOffset = (8 * MaxSizePerCell + 1);
FMpiBufferReader recvBuffer(comm.getComm(), nbProcess*recvBufferOffset);
CellClass recvBufferCells[8];
int firstProcThatSend = idProcess + 1;
FLOG(computationCounter.tic());
//Loop for work
for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
if(idProcess != 0
&& getWorkingInterval((idxLevel+1), idProcess).max <= getWorkingInterval((idxLevel+1), idProcess - 1).max){
break;
}
// copy cells to work with
int numberOfCells = 0;
// for each cells
do{
iterArray[numberOfCells++] = octreeIterator;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveUp();
octreeIterator = avoidGotoLeftIterator;
int cellsToSend = -1;
int iterRequests = 0;
int endIndex = numberOfCells;
//Test if I'm not the last, and I need something (data from the next proc) to compute my last M2M
if((idProcess != nbProcess-1) && (getWorkingInterval(idxLevel+1,idProcess+1)).min <= getWorkingInterval(idxLevel+1,idProcess).max){
endIndex--;
}
while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).min){
++cellsToSend;
}
FLOG(parallelCounter.tic());