Commit b095fe94 authored by BRAMAS Berenger

Update debug and clean MPI core algo

parent 76859146
@@ -99,6 +99,25 @@ class FFmmAlgorithmThreadProc : public FAbstractAlgorithm {
return workingIntervalsPerLevel[OctreeHeight * proc + level];
}
const Interval& getWorkingInterval( int level, int proc) const {
return workingIntervalsPerLevel[OctreeHeight * proc + level];
}
/** To know if a proc has work at a given level (if it hold cells and was responsible of them) */
bool procHasWorkAtLevel(const int idxLevel , const int idxProc) const {
return getWorkingInterval(idxLevel, idxProc).leftIndex <= getWorkingInterval(idxLevel, idxProc).rightIndex;
}
/** Return true if the idxProc left cell at idxLevel+1 has the same parent as us for our right cell */
bool procCoversMyRightBorderCell(const int idxLevel , const int idxProc) const {
return (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) == (getWorkingInterval((idxLevel+1) ,idxProc).leftIndex >>3);
}
/** Return true if the idxProc right cell at idxLevel+1 has the same parent as us for our left cell */
bool procCoversMyLeftBorderCell(const int idxLevel , const int idxProc) const {
return (getWorkingInterval((idxLevel+1) , idxProc).rightIndex >>3) == (getWorkingInterval((idxLevel+1) , idProcess).leftIndex>>3);
}
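The three helpers added above reduce to Morton-index arithmetic: a process has work at a level when its interval is non-empty (leftIndex <= rightIndex), and two processes touch the same border parent when their child indices, shifted right by 3 bits, coincide (each octree level contributes 3 bits, so index >> 3 is the parent's Morton index). A minimal illustrative sketch of that parent test, using hypothetical index values that are not part of the commit:

#include <cassert>
#include <cstdint>

// Sketch only: MortonIndex is assumed here to be a 64-bit integer.
using MortonIndex = std::int64_t;

// Dropping the low 3 bits (the child position 0..7) yields the parent's Morton index.
MortonIndex parentOf(const MortonIndex child) {
    return child >> 3;
}

int main() {
    const MortonIndex rightBorderCellOfProcA = 0x2B; // hypothetical value, 0b101011
    const MortonIndex leftBorderCellOfProcB  = 0x2D; // hypothetical value, 0b101101
    // Same parent (0b101): exactly the situation procCoversMyRightBorderCell() detects.
    assert(parentOf(rightBorderCellOfProcA) == parentOf(leftBorderCellOfProcB));
    return 0;
}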
public:
/** Get current proc interval at level */
Interval& getWorkingInterval( int level){
@@ -135,7 +154,6 @@ public:
/** Default destructor */
virtual ~FFmmAlgorithmThreadProc(){
for(int idxThread = 0 ; idxThread < MaxThreads ; ++idxThread){
printf("Delete %d\n",idxThread);
delete this->kernels[idxThread];
}
delete [] this->kernels;
@@ -230,7 +248,7 @@ public:
if(operationsToProceed & FFmmM2M) upwardPass();
- if(operationsToProceed & FFmmM2L) transferPassOld();
+ if(operationsToProceed & FFmmM2L) transferPass();
if(operationsToProceed & FFmmL2L) downardPass();
@@ -266,10 +284,10 @@ private:
} while(octreeIterator.moveRight());
FLOG(computationCounter.tic());
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
#pragma omp for nowait
for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
}
@@ -283,199 +301,6 @@ private:
// Upward
/////////////////////////////////////////////////////////////////////////////
/** M2M */
void upwardPassOld(){
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Upward Pass\n").write(FLog::Flush); );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
FLOG(FTic prepareCounter);
FLOG(FTic waitCounter);
// Start from leal level - 1
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
octreeIterator.moveUp();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// This variable is the proc responsible
// of the shared cells
int currentProcIdToSendTo = idProcess;
// There are a maximum of 8-1 sends and 8-1 receptions
MPI_Request requests[14];
MPI_Status status[14];
// Maximum data per message is:
FMpiBufferWriter sendBuffer(comm.getComm(),7*MaxSizePerCell);
const int recvBufferOffset = (8 * MaxSizePerCell + 1);
FMpiBufferReader recvBuffer(comm.getComm(), nbProcess*recvBufferOffset);
CellClass recvBufferCells[8];
int firstProcThatSend = idProcess + 1;
// for each levels
for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
// No more work for me
if(idProcess != 0 && getWorkingInterval((idxLevel+1), idProcess).rightIndex <= getWorkingInterval((idxLevel+1), idProcess - 1).rightIndex){
break;
}
// copy cells to work with
int numberOfCells = 0;
// for each cells
do{
iterArray[numberOfCells++] = octreeIterator;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveUp();
octreeIterator = avoidGotoLeftIterator;
// We may need to send something
int iterRequests = 0;
int cellsToSend = -1;
while(iterArray[cellsToSend+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
++cellsToSend;
}
FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
FLOG(prepareCounter.tic());
if(idProcess != 0 && (getWorkingInterval((idxLevel+1), idProcess).leftIndex >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).rightIndex >>3)){
char state = 0;
sendBuffer.write(state);
const CellClass* const* const child = iterArray[cellsToSend].getCurrentChild();
for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
child[idxChild]->serializeUp(sendBuffer);
state = char(state | (0x1 << idxChild));
}
}
sendBuffer.writeAt(0,state);
while( currentProcIdToSendTo && iterArray[cellsToSend].getCurrentGlobalIndex() <= getWorkingInterval(idxLevel , currentProcIdToSendTo - 1).rightIndex){
--currentProcIdToSendTo;
}
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, currentProcIdToSendTo,
FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]);
}
// We may need to receive something
bool hasToReceive = false;
int endProcThatSend = firstProcThatSend;
if(idProcess != nbProcess - 1){ // if I'm the last one (idProcess == nbProcess-1), I shall not receive anything in a M2M
while(firstProcThatSend < nbProcess
&& (getWorkingInterval((idxLevel+1), firstProcThatSend).rightIndex) <= (getWorkingInterval((idxLevel+1), idProcess).rightIndex)){
// Second condition :: while firstProcThatSend rightIndex morton index is < to myself rightIndex interval
++firstProcThatSend;
}
if(firstProcThatSend < nbProcess &&
(getWorkingInterval((idxLevel+1), firstProcThatSend).leftIndex >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) ){
endProcThatSend = firstProcThatSend;
while( endProcThatSend < nbProcess &&
(getWorkingInterval((idxLevel+1) ,endProcThatSend).leftIndex >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3)){
++endProcThatSend;
}
if(firstProcThatSend != endProcThatSend){
hasToReceive = true;
for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){
MPI_Irecv(&recvBuffer.data()[idxProc * recvBufferOffset], recvBufferOffset, MPI_PACKED,
idxProc, FMpi::TagFmmM2M, comm.getComm(), &requests[iterRequests++]);
}
}
}
}
FLOG(prepareCounter.tac());
FTRACE( regionTrace.end() );
// Compute
const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells);
FLOG(computationCounter.tic());
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for nowait
for( int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
// for(int k=0 ; k< 8 ; ++k){
// if(iterArray[idxCell].getCurrentChild()[k]){
// FILE * fd = fopen("ResM2MNearOld","a+");
// fprintf(fd,"%lld\t% lld\t %d\n",iterArray[idxCell].getCurrentCell()->getMortonIndex(),iterArray[idxCell].getCurrentChild()[k]->getMortonIndex(),idxLevel);
// fclose(fd);
// }
//}
}
}
FLOG(computationCounter.tac());
// Are we sending or waiting anything?
if(iterRequests){
FLOG(waitCounter.tic());
MPI_Waitall( iterRequests, requests, status);
FLOG(waitCounter.tac());
// we were receiving data
if( hasToReceive ){
CellClass* currentChild[8];
memcpy(currentChild, iterArray[numberOfCells - 1].getCurrentChild(), 8 * sizeof(CellClass*));
// retreive data and merge my child and the child from others
for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc){
recvBuffer.seek(idxProc * recvBufferOffset);
int state = int(recvBuffer.getValue<char>());
int position = 0;
while( state && position < 8){
while(!(state & 0x1)){
state >>= 1;
++position;
}
FAssertLF(!currentChild[position], "Already has a cell here");
recvBufferCells[position].deserializeUp(recvBuffer);
currentChild[position] = (CellClass*) &recvBufferCells[position];
state >>= 1;
++position;
}
}
// Finally compute
FLOG(computationCounter.tic());
(*kernels[0]).M2M( iterArray[numberOfCells - 1].getCurrentCell() , currentChild, idxLevel);
FLOG(computationCounter.tac());
firstProcThatSend = endProcThatSend - 1;
}
}
sendBuffer.reset();
recvBuffer.seek(0);
}
FLOG( FLog::Controller << "\tFinished (@Upward Pass (M2M) = " << counterTime.tacAndElapsed() << " s)\n" );
FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
}
/** M2M */
void upwardPass(){
const int MaxSizePerCell = CellClass::GetSize();
@@ -493,16 +318,16 @@ private:
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// The proc to send the shared cells to (will move to left)
- int currentProcIdToSendTo = idProcess;
+ int currentProcIdToSendTo = (idProcess - 1);
- // There are a maximum of 8-1 sends and 8-1 receptions
+ // There are a maximum of 1 sends and 8-1 receptions
- MPI_Request requests[14];
+ MPI_Request requests[8];
- MPI_Status status[14];
+ MPI_Status status[8];
// Maximum data per message is:
- FMpiBufferWriter sendBuffer(comm.getComm(),7*MaxSizePerCell);
+ FMpiBufferWriter sendBuffer(comm.getComm(), 7*MaxSizePerCell + 1);
- const int recvBufferOffset = (8 * MaxSizePerCell + 1);
+ const int recvBufferOffset = (7 * MaxSizePerCell + 1);
- FMpiBufferReader recvBuffer(comm.getComm(), nbProcess*recvBufferOffset);
+ FMpiBufferReader recvBuffer(comm.getComm(), 7*recvBufferOffset);
CellClass recvBufferCells[8];
// The first proc that send to me a cell
@@ -511,9 +336,9 @@ private:
for(int idxLevel = OctreeHeight - 2 ; idxLevel > 1 ; --idxLevel ){
// Does my cells are covered by my neighbors working interval
- const bool noMoreWorkForMe = (idProcess != 0 && getWorkingInterval((idxLevel+1), idProcess).rightIndex <= getWorkingInterval((idxLevel+1), idProcess - 1).rightIndex);
+ const bool noMoreWorkForMe = (idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess));
if(noMoreWorkForMe){
- FAssertLF(getWorkingInterval(idxLevel, idProcess).rightIndex < getWorkingInterval(idxLevel, idProcess).leftIndex);
+ FAssertLF(procHasWorkAtLevel(idxLevel, idProcess) == false);
break;
}
@@ -527,34 +352,27 @@ private:
int iterMpiRequests = 0; // The iterator for send/recv requests
- int nbCellsToSend = 0; // The number of cells to send
+ int nbCellsToSkip = 0; // The number of cells to send
// Skip all the cells that are out of my working interval
- while(iterArray[nbCellsToSend].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
+ while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel, idProcess).leftIndex){
- ++nbCellsToSend;
+ ++nbCellsToSkip;
}
int nbCellsForThreads = totalNbCellsAtLevel; // totalNbCellsAtLevel or totalNbCellsAtLevel-1
bool hasToReceive = false;
- if(idProcess != nbProcess-1){
+ if(idProcess != nbProcess-1 && procHasWorkAtLevel(idxLevel , idProcess)){
// Find the first proc that may send to me
- while(firstProcThatSend < nbProcess
+ while(firstProcThatSend < nbProcess && !procHasWorkAtLevel(idxLevel+1, firstProcThatSend) ){
&& (getWorkingInterval((idxLevel+1), firstProcThatSend).rightIndex) <= (getWorkingInterval((idxLevel+1), idProcess).rightIndex)){
firstProcThatSend += 1;
}
// Do we have to receive?
- if(firstProcThatSend < nbProcess && (getWorkingInterval((idxLevel+1), firstProcThatSend).leftIndex >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3) ){
+ if(firstProcThatSend < nbProcess && procHasWorkAtLevel(idxLevel+1, firstProcThatSend) && procCoversMyRightBorderCell(idxLevel, firstProcThatSend) ){
hasToReceive = true;
// Threads do not compute the last cell, we will do it once data are received
nbCellsForThreads -= 1;
}
}
// TODO REMOVE
//if((iterArray[0].getCurrentGlobalIndex() == getWorkingInterval(idxLevel, idProcess-1).rightIndex) && numberOfCells==1){
// nbCellsToSend++;
//}
// END REMOVE
FLOG(parallelCounter.tic());
#pragma omp parallel
{
@@ -564,95 +382,92 @@ private:
#pragma omp single nowait
{
FLOG(singleCounter.tic());
- const bool firstCellAreOutOfMyInterval = idProcess != 0 && (getWorkingInterval((idxLevel+1), idProcess).leftIndex >>3) <= (getWorkingInterval((idxLevel+1), idProcess - 1).rightIndex >>3);
+ if(idProcess != 0){
- //Post Send
+ while( currentProcIdToSendTo && !procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) ){
- if(firstCellAreOutOfMyInterval){
+ --currentProcIdToSendTo;
FAssertLF(nbCellsToSend != 0);
char packageFlags = 0;
sendBuffer.write(packageFlags);
// Only the cell the most on the right out of my working interval should be taken in
// consideration (at pos nbCellsToSend-1) other (x < nbCellsToSend-1) have already been sent
const CellClass* const* const child = iterArray[nbCellsToSend-1].getCurrentChild();
for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
// Check if child exists and it was part of my working interval
if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
// Add the cell to the buffer
child[idxChild]->serializeUp(sendBuffer);
packageFlags = char(packageFlags | (0x1 << idxChild));
}
} }
// Add the flag as first value
sendBuffer.writeAt(0,packageFlags);
- // Find the proc to send to
+ if(procHasWorkAtLevel(idxLevel, currentProcIdToSendTo) && procCoversMyLeftBorderCell(idxLevel, currentProcIdToSendTo)){
- while( currentProcIdToSendTo && iterArray[nbCellsToSend-1].getCurrentGlobalIndex() <= getWorkingInterval(idxLevel , currentProcIdToSendTo - 1).rightIndex){
+ FAssertLF(nbCellsToSkip != 0);
--currentProcIdToSendTo;
char packageFlags = 0;
sendBuffer.write(packageFlags);
// Only the cell the most on the right out of my working interval should be taken in
// consideration (at pos nbCellsToSkip-1) other (x < nbCellsToSkip-1) have already been sent
const CellClass* const* const child = iterArray[nbCellsToSkip-1].getCurrentChild();
for(int idxChild = 0 ; idxChild < 8 ; ++idxChild){
// Check if child exists and it was part of my working interval
if( child[idxChild] && getWorkingInterval((idxLevel+1), idProcess).leftIndex <= child[idxChild]->getMortonIndex() ){
// Add the cell to the buffer
child[idxChild]->serializeUp(sendBuffer);
packageFlags = char(packageFlags | (0x1 << idxChild));
}
}
// Add the flag as first value
sendBuffer.writeAt(0,packageFlags);
// Post the message
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, currentProcIdToSendTo,
FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
} }
// Post the message
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, currentProcIdToSendTo,
FMpi::TagFmmM2M, comm.getComm(), &requests[iterMpiRequests++]);
} }
//Post receive, Datas needed in several parts of the section
int nbProcThatSendToMe = 0;
if(hasToReceive){ // if I'm the last one (idProcess == nbProcess-1), I shall not receive anything in a M2M
//Test : if the firstProcThatSend father minimal value in interval is lesser than mine
- int endProcThatSend = firstProcThatSend;
+ int idProcSource = firstProcThatSend;
// Find the last proc that should send to me
- while( endProcThatSend < nbProcess &&
+ while( idProcSource < nbProcess
- (getWorkingInterval((idxLevel+1) ,endProcThatSend).leftIndex >>3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex>>3)){
+ && ( !procHasWorkAtLevel(idxLevel+1, idProcSource) || procCoversMyRightBorderCell(idxLevel, idProcSource) )){
- ++endProcThatSend;
+ if(procHasWorkAtLevel(idxLevel+1, idProcSource) && procCoversMyRightBorderCell(idxLevel, idProcSource)){
- }
+ MPI_Irecv(&recvBuffer.data()[nbProcThatSendToMe * recvBufferOffset], recvBufferOffset, MPI_PACKED,
idProcSource, FMpi::TagFmmM2M + idxLevel, comm.getComm(), &requests[iterMpiRequests++]);
- // If there is at least one proc that send me something
+ nbProcThatSendToMe += 1;
- if(firstProcThatSend != endProcThatSend){
+ FAssertLF(nbProcThatSendToMe <= 7);
nbProcThatSendToMe = endProcThatSend - firstProcThatSend;
FAssertLF(nbProcThatSendToMe <= 7);
// Post a recv for each of them
for(int idxProc = firstProcThatSend ; idxProc < endProcThatSend ; ++idxProc ){
MPI_Irecv(&recvBuffer.data()[idxProc * recvBufferOffset], recvBufferOffset, MPI_PACKED,
idxProc, FMpi::TagFmmM2M, comm.getComm(), &requests[iterMpiRequests++]);
} }
++idProcSource;
} }
} }
//Wait For the comms, and do the work
// Are we sending or waiting anything?
if(iterMpiRequests){
FAssertLF(iterMpiRequests <= 8);
MPI_Waitall( iterMpiRequests, requests, status);
- // we were receiving data
+ }
if( hasToReceive ){
- if( hasToReceive ){
+ FAssertLF(iterMpiRequests != 0);
CellClass* currentChild[8];
memcpy(currentChild, iterArray[totalNbCellsAtLevel - 1].getCurrentChild(), 8 * sizeof(CellClass*));
// Retreive data and merge my child and the child from others
- for(int idxProc = firstProcThatSend ; idxProc < firstProcThatSend + nbProcThatSendToMe ; ++idxProc){
+ for(int idxProc = 0 ; idxProc < nbProcThatSendToMe ; ++idxProc){
recvBuffer.seek(idxProc * recvBufferOffset);
int state = int(recvBuffer.getValue<char>());
int position = 0;
while( state && position < 8){
while(!(state & 0x1)){
state >>= 1;
++position;
}
FAssertLF(!currentChild[position], "Already has a cell here");
recvBufferCells[position].deserializeUp(recvBuffer);
currentChild[position] = (CellClass*) &recvBufferCells[position];
state >>= 1;
++position;
}
}
- // Finally compute
+ FAssertLF(!currentChild[position], "Already has a cell here");
- (*kernels[threadNumber]).M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel);
+ recvBufferCells[position].deserializeUp(recvBuffer);
- firstProcThatSend += nbProcThatSendToMe - 1;
+ currentChild[position] = (CellClass*) &recvBufferCells[position];
state >>= 1;
++position;
}
} }
// Finally compute
(*kernels[threadNumber]).M2M( iterArray[totalNbCellsAtLevel - 1].getCurrentCell() , currentChild, idxLevel);
firstProcThatSend += nbProcThatSendToMe - 1;
} }
// Reset buffer
sendBuffer.reset();
recvBuffer.seek(0);
FLOG(singleCounter.tac());
@@ -660,7 +475,7 @@ private:
// All threads proceed the M2M
#pragma omp for nowait
- for( int idxCell = nbCellsToSend ; idxCell < nbCellsForThreads ; ++idxCell){
+ for( int idxCell = nbCellsToSkip ; idxCell < nbCellsForThreads ; ++idxCell){
myThreadkernels->M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
}
}//End of parallel section
@@ -680,7 +495,7 @@ private:
/////////////////////////////////////////////////////////////////////////////
- void transferPassOld(){
+ void transferPass(){
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
@@ -718,11 +533,9 @@ private:
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// for each levels
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
- if(idProcess != 0
+ if(!procHasWorkAtLevel(idxLevel, idProcess)){
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
@@ -742,7 +555,6 @@ private:
leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);
// Which cell potentialy needs other data and in the same time
// are potentialy needed by other
MortonIndex neighborsIndexes[189];
@@ -780,9 +592,7 @@ private:
if(needOther){
leafsNeedOther[idxLevel]->set(idxCell,true);
}
}
}
FLOG(prepareCounter.tac());
@@ -801,7 +611,6 @@ private:
FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, comm.getComm()), __LINE__ );
FLOG(gatherCounter.tac());
//////////////////////////////////////////////////////////////////
// Send and receive for real
//////////////////////////////////////////////////////////////////
@@ -863,12 +672,9 @@ private:
// Now we can compute all the data
// for each levels
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
- if(idProcess != 0
+ if(!procHasWorkAtLevel(idxLevel, idProcess)){
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
@@ -918,12 +724,9 @@ private:
// compute the second time
// for each levels
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
- if(idProcess != 0
+ if(!procHasWorkAtLevel(idxLevel, idProcess)){
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
@@ -995,10 +798,6 @@ private:
}
}
}
// FILE * fd = fopen("res0","a+");
// fprintf(fd,"Q %d\t %d\t %d\t %d\n",
// idProcess,idxLevel,idxCell,counter);
// fclose(fd);
// need to compute
if(counter){
myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
@@ -1037,410 +836,11 @@ private:
}
/** M2L */
void transferPass(){
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Downward Pass (M2L)\n").write(FLog::Flush); );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
FLOG(FTic singleCounter);
FLOG(FTic gatherCounter);
FLOG(FTic m2lSelf);
FLOG(FTic m2lFar);
FLOG(FTic sendCounter);
//////////////////////////////////////////////////////////////////
// First know what to send to who
//////////////////////////////////////////////////////////////////
// pointer to send
FVector<typename OctreeClass::Iterator> toSend[nbProcess * OctreeHeight];
// index
int*const indexToSend = new int[nbProcess * OctreeHeight];
memset(indexToSend, 0, sizeof(int) * nbProcess * OctreeHeight);
// To know which one has need someone
FBoolArray** const leafsNeedOther = new FBoolArray*[OctreeHeight];
memset(leafsNeedOther, 0, sizeof(FBoolArray*) * OctreeHeight);
//Variables needed in multiple omp sections
int iterRequest;
MPI_Request * requests;
MPI_Status * status;
int* globalReceiveMap;
FMpiBufferWriter** sendBuffer;
FMpiBufferReader** recvBuffer;
// #pragma omp parallel /*default(none)*/ shared(m2lSelf,globalReceiveMap,requests,sendBuffer,recvBuffer,iterRequest,status,singleCounter,toSend,gatherCounter,sendCounter)
#pragma omp parallel /*default(none)*/ shared(globalReceiveMap,requests,sendBuffer,recvBuffer,iterRequest,status,toSend) FLOG(shared(m2lSelf,singleCounter,gatherCounter,sendCounter))
{
int threadIdx = omp_get_thread_num();
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.gotoBottomLeft();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
#pragma omp single //nowait
{
FTRACE( FTrace::FRegion regionTrace( "Preprocess" , __FUNCTION__ , __FILE__ , __LINE__) );
FLOG(singleCounter.tic());
typename OctreeClass::Iterator octreeIteratorComm(tree);
octreeIteratorComm.moveDown();
typename OctreeClass::Iterator avoidGotoLeftIteratorComm(octreeIteratorComm);
// To know if a leaf has been already sent to a proc
bool*const alreadySent = new bool[nbProcess];
memset(alreadySent, 0, sizeof(bool) * nbProcess);
// for each levels
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
//If I'm not the process 0 AND my rightIndex idx is lesser than left one's
if(idProcess != 0
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIteratorComm.moveDown();
octreeIteratorComm = avoidGotoLeftIteratorComm;
continue;
}
int numberOfCells = 0;
while(octreeIteratorComm.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
octreeIteratorComm.moveRight();
}
// for each cells
do{
iterArrayComm[numberOfCells] = octreeIteratorComm;
++numberOfCells;
} while(octreeIteratorComm.moveRight());
avoidGotoLeftIteratorComm.moveDown();
octreeIteratorComm = avoidGotoLeftIteratorComm;
//printf("Single :: I'm %d, numberOfCells : %d my iterArrayComm %p \n",omp_get_thread_num(),numberOfCells,iterArrayComm);
leafsNeedOther[idxLevel] = new FBoolArray(numberOfCells);
// Which cell potentialy needs other data and in the same time
// are potentialy needed by other
MortonIndex neighborsIndexes[189];
for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
// Find the M2L neigbors of a cell
const int counter = iterArrayComm[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndexes);
memset(alreadySent, false, sizeof(bool) * nbProcess);
bool needOther = false;
// Test each negibors to know which one do not belong to us
for(int idxNeigh = 0 ; idxNeigh < counter ; ++idxNeigh){
if(neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , idProcess).leftIndex
|| (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndexes[idxNeigh]){
int procToReceive = idProcess;
while( 0 != procToReceive && neighborsIndexes[idxNeigh] < getWorkingInterval(idxLevel , procToReceive).leftIndex ){
--procToReceive;
}
while( procToReceive != nbProcess -1 && (getWorkingInterval(idxLevel , procToReceive).rightIndex) < neighborsIndexes[idxNeigh]){
++procToReceive;
}
// Maybe already sent to that proc?
if( !alreadySent[procToReceive]
&& getWorkingInterval(idxLevel , procToReceive).leftIndex <= neighborsIndexes[idxNeigh]
&& neighborsIndexes[idxNeigh] <= getWorkingInterval(idxLevel , procToReceive).rightIndex){
alreadySent[procToReceive] = true;
needOther = true;
toSend[idxLevel * nbProcess + procToReceive].push(iterArray[idxCell]);
++indexToSend[idxLevel * nbProcess + procToReceive];
}
}
}
if(needOther){
leafsNeedOther[idxLevel]->set(idxCell,true);
}
}
}
delete[] alreadySent;
//////////////////////////////////////////////////////////////////
// Gather this information
//////////////////////////////////////////////////////////////////
FLOG(gatherCounter.tic());
// All process say to each others
// what the will send to who
globalReceiveMap = new int[nbProcess * nbProcess * OctreeHeight];
memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess * OctreeHeight);
FMpi::MpiAssert( MPI_Allgather( indexToSend, nbProcess * OctreeHeight, MPI_INT, globalReceiveMap, nbProcess * OctreeHeight, MPI_INT, comm.getComm()), __LINE__ );
FLOG(gatherCounter.tac());
// if(idProcess == 0)
// {
// FILE * fd = fopen("M2LMap","a+");
// for(int k =0; k < nbProcess*nbProcess*OctreeHeight ; ++k){
// fprintf(fd,"%d\n",globalReceiveMap[k]);
// }
// fclose(fd);
// }
//////////////////////////////////////////////////////////////////
// Send and receive for real
//////////////////////////////////////////////////////////////////
FLOG(sendCounter.tic());
// Then they can send and receive (because they know what they will receive)
// To send in asynchrone way
requests = new MPI_Request[2 * nbProcess * OctreeHeight];
status = new MPI_Status[2 * nbProcess * OctreeHeight];
iterRequest = 0;
const int SizeOfCellToSend = int(sizeof(MortonIndex) + sizeof(int) + MaxSizePerCell);
sendBuffer = new FMpiBufferWriter*[nbProcess * OctreeHeight];
memset(sendBuffer, 0, sizeof(FMpiBufferWriter*) * nbProcess * OctreeHeight);
recvBuffer = new FMpiBufferReader*[nbProcess * OctreeHeight];
memset(recvBuffer, 0, sizeof(FMpiBufferReader*) * nbProcess * OctreeHeight);
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
const int toSendAtProcAtLevel = indexToSend[idxLevel * nbProcess + idxProc];
if(toSendAtProcAtLevel != 0){
sendBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferWriter(comm.getComm(),toSendAtProcAtLevel * SizeOfCellToSend);
for(int idxLeaf = 0 ; idxLeaf < toSendAtProcAtLevel; ++idxLeaf){
const MortonIndex cellIndex = toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentGlobalIndex();
sendBuffer[idxLevel * nbProcess + idxProc]->write(cellIndex);
toSend[idxLevel * nbProcess + idxProc][idxLeaf].getCurrentCell()->serializeUp(*sendBuffer[idxLevel * nbProcess + idxProc]);
}
FMpi::MpiAssert( MPI_Isend( sendBuffer[idxLevel * nbProcess + idxProc]->data(),
sendBuffer[idxLevel * nbProcess + idxProc]->getSize(),MPI_PACKED, idxProc,
FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
}
const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
if(toReceiveFromProcAtLevel){
recvBuffer[idxLevel * nbProcess + idxProc] = new FMpiBufferReader(comm.getComm(),toReceiveFromProcAtLevel * SizeOfCellToSend);
FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxLevel * nbProcess + idxProc]->data(),
recvBuffer[idxLevel * nbProcess + idxProc]->getCapacity(), MPI_PACKED,idxProc,
FMpi::TagLast + idxLevel, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
}
}
}
FLOG(sendCounter.tac());
{
//////////////////////////////////////////////////////////////////
// Wait received data and compute
//////////////////////////////////////////////////////////////////
// Wait to receive every things (and send every things)
MPI_Waitall(iterRequest, requests, status);
}
FLOG(singleCounter.tac());
}
//End of Single section
//////////////////////////////////////////////////////////////////
// Do M2L SELF
//////////////////////////////////////////////////////////////////
FLOG(m2lSelf.tic());
{
FTRACE( FTrace::FRegion regionTrace("Compute", __FUNCTION__ , __FILE__ , __LINE__) );
// Now we can compute all the data
// for each levels
for(int idxLevel = OctreeHeight-1 ; idxLevel > 1 ; --idxLevel ){
if(idProcess != 0
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveUp();
octreeIterator = avoidGotoLeftIterator;
continue;
}
int numberOfCells = 0;
// for each cells
//TODO see if it can be moved somewhere else
#pragma omp single copyprivate(numberOfCells,octreeIterator,avoidGotoLeftIterator)
{
while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
octreeIterator.moveRight();
}
do{
iterArray[numberOfCells] = octreeIterator;
++numberOfCells;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveUp();
octreeIterator = avoidGotoLeftIterator;
}//End of single section
FLOG(computationCounter.tic());
{
KernelClass * const myThreadkernels = kernels[threadIdx];
const CellClass* neighbors[343];
//Change kind
#pragma omp for schedule(static,1) //nowait
for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
const int counter = tree->getInteractionNeighbors(neighbors, iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel);
if(counter){
myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
}
}
myThreadkernels->finishedLevelM2L(idxLevel);
}
FLOG(computationCounter.tac());
}
}
FLOG(m2lSelf.tac());
}//End of parallel section
//Synch between threads at implict barrier
FTRACE( FTrace::FRegion regionTrace("Compute Received data", __FUNCTION__ , __FILE__ , __LINE__) );
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.moveDown();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
// compute the second time
// for each levels
for(int idxLevel = 2 ; idxLevel < OctreeHeight ; ++idxLevel ){
if(idProcess != 0
&& getWorkingInterval(idxLevel, idProcess).rightIndex <= getWorkingInterval(idxLevel, idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
// put the received data into a temporary tree
FLightOctree<CellClass> tempTree;
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
const int toReceiveFromProcAtLevel = globalReceiveMap[(idxProc * nbProcess * OctreeHeight) + idxLevel * nbProcess + idProcess];
for(int idxCell = 0 ; idxCell < toReceiveFromProcAtLevel ; ++idxCell){
const MortonIndex cellIndex = recvBuffer[idxLevel * nbProcess + idxProc]->FMpiBufferReader::getValue<MortonIndex>();
CellClass* const newCell = new CellClass;
newCell->setMortonIndex(cellIndex);
newCell->deserializeUp(*recvBuffer[idxLevel * nbProcess + idxProc]);
tempTree.insertCell(cellIndex, idxLevel, newCell);
}
}
// take cells from our octree only if they are
// linked to received data
int numberOfCells = 0;
int realCellId = 0;
while(octreeIterator.getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
octreeIterator.moveRight();
}
// for each cells
do{
// copy cells that need data from others
if(leafsNeedOther[idxLevel]->get(realCellId++)){
iterArray[numberOfCells++] = octreeIterator;
}
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
printf("Proc :: %d Lvl %d, numberOfCells : %d that need other datas \n",idProcess,idxLevel,numberOfCells);
delete leafsNeedOther[idxLevel];
leafsNeedOther[idxLevel] = nullptr;
// Compute this cells
FLOG(computationCounter.tic());
FLOG(m2lFar.tic());
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
MortonIndex neighborsIndex[189];
int neighborsPosition[189];
const CellClass* neighbors[343];
printf("Proc %d, idThread %d, idxLevel %d, number : %d \n",
idProcess,omp_get_thread_num(),idxLevel,numberOfCells);
#pragma omp for schedule(dynamic) //nowait
for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
// compute indexes
memset(neighbors, 0, 343 * sizeof(CellClass*));
memset(neighborsPosition, 0, 189 * sizeof(int));
memset(neighborsIndex, 0, 189 * sizeof(MortonIndex));
const int counterNeighbors = iterArray[idxCell].getCurrentGlobalCoordinate().getInteractionNeighbors(idxLevel, neighborsIndex, neighborsPosition);
int counter = 0;
// does we receive this index from someone?
for(int idxNeig = 0 ;idxNeig < counterNeighbors ; ++idxNeig){
if(neighborsIndex[idxNeig] < (getWorkingInterval(idxLevel , idProcess).leftIndex)
|| (getWorkingInterval(idxLevel , idProcess).rightIndex) < neighborsIndex[idxNeig]){
CellClass*const otherCell = tempTree.getCell(neighborsIndex[idxNeig], idxLevel);
if(otherCell){
//otherCell->setMortonIndex(neighborsIndex[idxNeig]);
neighbors[ neighborsPosition[idxNeig] ] = otherCell;
++counter;
}
}
}
// FILE * fd = fopen("res1","a+");
// fprintf(fd,"Q %d\t %d\t %d\t %d\n",
// idProcess,idxLevel,idxCell,counter);
// fclose(fd);
// need to compute
if(counter){
myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
}
}
myThreadkernels->finishedLevelM2L(idxLevel);
}//End of parallel section
FLOG(computationCounter.tac());
FLOG(m2lFar.tac());
}
for(int idxComm = 0 ; idxComm < nbProcess * OctreeHeight; ++idxComm){
delete sendBuffer[idxComm];
delete recvBuffer[idxComm];
}
for(int idxComm = 0 ; idxComm < OctreeHeight; ++idxComm){
delete leafsNeedOther[idxComm];
}
delete[] sendBuffer;
delete[] recvBuffer;
delete[] indexToSend;
delete[] leafsNeedOther;
delete[] globalReceiveMap;
delete[] requests;
delete[] status;
FLOG( FLog::Controller << "\tFinished (@Downward Pass (M2L) = " << counterTime.tacAndElapsed() << " s)\n" );
FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t Single : " << singleCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t M2L Self : " << m2lSelf.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t M2L Far : " << m2lFar.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t M2L Gather : " << gatherCounter.elapsed() << " s\n" );
FLOG( FLog::Controller << "\t\t M2L Send : " << sendCounter.elapsed() << " s\n" );
}
//////////////////////////////////////////////////////////////////
// ---------------- L2L ---------------
//////////////////////////////////////////////////////////////////
- void downardPassOld(){ // second L2L
+ void downardPass(){ // second L2L
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
@@ -1454,263 +854,105 @@ private:
octreeIterator.moveDown();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
- MPI_Request*const requests = new MPI_Request[nbProcess];
+ // Max 1 receive and 7 send (but 7 times the same data)
- MPI_Status*const status = new MPI_Status[nbProcess];
+ MPI_Request*const requests = new MPI_Request[8];
MPI_Status*const status = new MPI_Status[8];
const int heightMinusOne = OctreeHeight - 1;
FMpiBufferWriter sendBuffer(comm.getComm(),MaxSizePerCell);
FMpiBufferReader recvBuffer(comm.getComm(),MaxSizePerCell);
int righestProcToSendTo = nbProcess - 1;
// for each levels exepted leaf level
for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
- if(idProcess != 0
+ // If nothing to do in the next level skip the current one
- && getWorkingInterval((idxLevel+1) , idProcess).rightIndex <= getWorkingInterval((idxLevel+1) , idProcess - 1).rightIndex){
+ if(idProcess != 0 && !procHasWorkAtLevel(idxLevel+1, idProcess) ){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
- // copy cells to work with
+ int totalNbCellsAtLevel = 0;
int numberOfCells = 0;
// for each cells
do{
- iterArray[numberOfCells++] = octreeIterator;
+ iterArray[totalNbCellsAtLevel++] = octreeIterator;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
- int firstCellWork = -1;
+ int nbCellsToSkip = 0;
- while(iterArray[firstCellWork+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
+ while(nbCellsToSkip < totalNbCellsAtLevel && iterArray[nbCellsToSkip].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
- ++firstCellWork;
+ nbCellsToSkip += 1;
}
- bool needToRecv = false;
+ bool hasToReceive = false;
- int iterRequests = 0;
+ int idxProcToReceive = idProcess - 1;
if(idProcess != 0 && nbCellsToSkip){
- FLOG(prepareCounter.tic());
+ while(idxProcToReceive && !procHasWorkAtLevel(idxLevel, idxProcToReceive) ){
idxProcToReceive -= 1;
// do we need to receive one or zeros cell
if(idProcess != 0
&& (getWorkingInterval((idxLevel + 1) , idProcess).leftIndex >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).rightIndex >> 3 ) ){
needToRecv = true;
MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, MPI_ANY_SOURCE,
FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]);
}
if(idProcess != nbProcess - 1){
int firstProcThatRecv = idProcess + 1;
while( firstProcThatRecv < nbProcess &&
getWorkingInterval((idxLevel + 1) , firstProcThatRecv).rightIndex <= getWorkingInterval((idxLevel+1) , idProcess).rightIndex){
++firstProcThatRecv;
}
int endProcThatRecv = firstProcThatRecv;
while( endProcThatRecv < nbProcess &&
(getWorkingInterval((idxLevel + 1) , endProcThatRecv).leftIndex >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex >> 3) ){
++endProcThatRecv;
}
if(firstProcThatRecv != endProcThatRecv){
iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer);
for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProc,
FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]);
}
}
}
FLOG(prepareCounter.tac());
FLOG(computationCounter.tic());
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for nowait
for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
} }
} if(procHasWorkAtLevel(idxLevel, idxProcToReceive) && procCoversMyLeftBorderCell(idxLevel, idxProcToReceive)){
FLOG(computationCounter.tac()); hasToReceive = true;
// are we sending or receiving?
if(iterRequests){
// process
FLOG(waitCounter.tic());
MPI_Waitall( iterRequests, requests, status);
FLOG(waitCounter.tac());
if(needToRecv){
// Need to compute
FLOG(computationCounter.tic());
iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer);
kernels[0]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel);
FLOG(computationCounter.tac());
} }
} }
sendBuffer.reset(); #pragma omp parallel
recvBuffer.seek(0);
}
delete[] requests;
delete[] status;
FLOG( FLog::Controller << "\tFinished (@Downward Pass (L2L) = " << counterTime.tacAndElapsed() << " s)\n" );
FLOG( FLog::Controller << "\t\t Computation : " << computationCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t Prepare : " << prepareCounter.cumulated() << " s\n" );
FLOG( FLog::Controller << "\t\t Wait : " << waitCounter.cumulated() << " s\n" );
}
void downardPass(){ // second L2L
const int MaxSizePerCell = CellClass::GetSize();
FTRACE( FTrace::FFunction functionTrace(__FUNCTION__, "Fmm" , __FILE__ , __LINE__) );
FLOG( FLog::Controller.write("\tStart Downward Pass (L2L)\n").write(FLog::Flush); );
FLOG(FTic counterTime);
FLOG(FTic computationCounter);
FLOG(FTic prepareCounter);
FLOG(FTic waitCounter);
// Start from leal level - 1
typename OctreeClass::Iterator octreeIterator(tree);
octreeIterator.moveDown();
typename OctreeClass::Iterator avoidGotoLeftIterator(octreeIterator);
MPI_Request*const requests = new MPI_Request[nbProcess];
MPI_Status*const status = new MPI_Status[nbProcess];
const int heightMinusOne = OctreeHeight - 1;
FMpiBufferWriter sendBuffer(comm.getComm(),MaxSizePerCell);
FMpiBufferReader recvBuffer(comm.getComm(),MaxSizePerCell);
// for each levels exepted leaf level
for(int idxLevel = 2 ; idxLevel < heightMinusOne ; ++idxLevel ){
if(idProcess != 0
&& getWorkingInterval((idxLevel+1) , idProcess).rightIndex <= getWorkingInterval((idxLevel+1) , idProcess - 1).rightIndex){
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
continue;
}
// copy cells to work with
int numberOfCells = 0;
// for each cells
do{
iterArray[numberOfCells++] = octreeIterator;
} while(octreeIterator.moveRight());
avoidGotoLeftIterator.moveDown();
octreeIterator = avoidGotoLeftIterator;
int firstCellWork = -1;
while(iterArray[firstCellWork+1].getCurrentGlobalIndex() < getWorkingInterval(idxLevel , idProcess).leftIndex){
++firstCellWork;
}
if(iterArray[firstCellWork+1].getCurrentGlobalIndex() == getWorkingInterval(idxLevel , idProcess).leftIndex && numberOfCells==1){
printf("You Mouchard\n");
firstCellWork++;
}
#pragma omp parallel
{
int threadNumber = omp_get_thread_num();
KernelClass* myThreadkernels = (kernels[threadNumber]);
#pragma omp single
{
bool needToRecv = false;
int iterRequests = 0;
FLOG(prepareCounter.tic()); FLOG(prepareCounter.tic());
int iterRequests = 0;
// do we need to receive one or zeros cell if(hasToReceive){
if(idProcess != 0 FMpi::MpiAssert( MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, idxProcToReceive/*MPI_ANY_SOURCE*/,
&& (getWorkingInterval((idxLevel + 1) , idProcess).leftIndex >> 3 ) <= (getWorkingInterval((idxLevel+1) , idProcess - 1).rightIndex >> 3 ) ){ FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__ );
needToRecv = true;
MPI_Irecv( recvBuffer.data(), recvBuffer.getCapacity(), MPI_PACKED, MPI_ANY_SOURCE,
FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]);
} }
if(idProcess != nbProcess - 1 && idProcess < righestProcToSendTo){
if(idProcess != nbProcess - 1){ int idxProcSend = idProcess + 1;
int firstProcThatRecv = idProcess + 1; int nbMessageSent = 0;
while( firstProcThatRecv < nbProcess && while(idxProcSend <= righestProcToSendTo && ( !procHasWorkAtLevel(idxLevel+1, idxProcSend) || procCoversMyRightBorderCell(idxLevel, idxProcSend)) ){
getWorkingInterval((idxLevel + 1) , firstProcThatRecv).rightIndex <= getWorkingInterval((idxLevel+1) , idProcess).rightIndex){ if(procHasWorkAtLevel(idxLevel+1, idxProcSend)){
++firstProcThatRecv; if( nbMessageSent == 0 ){
} iterArray[totalNbCellsAtLevel - 1].getCurrentCell()->serializeDown(sendBuffer);
}
int endProcThatRecv = firstProcThatRecv; FMpi::MpiAssert( MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProcSend,
while( endProcThatRecv < nbProcess && FMpi::TagFmmL2L + idxLevel, comm.getComm(), &requests[iterRequests++]), __LINE__);
(getWorkingInterval((idxLevel + 1) , endProcThatRecv).leftIndex >> 3) <= (getWorkingInterval((idxLevel+1) , idProcess).rightIndex >> 3) ){ nbMessageSent += 1;
++endProcThatRecv; FAssertLF(nbMessageSent <= 7);
}
if(firstProcThatRecv != endProcThatRecv){
iterArray[numberOfCells - 1].getCurrentCell()->serializeDown(sendBuffer);
for(int idxProc = firstProcThatRecv ; idxProc < endProcThatRecv ; ++idxProc ){
MPI_Isend(sendBuffer.data(), sendBuffer.getSize(), MPI_PACKED, idxProc,
FMpi::TagFmmL2L, comm.getComm(), &requests[iterRequests++]);
} }
idxProcSend += 1;
} }
righestProcToSendTo = idxProcSend;
} }
// are we sending or receiving?
if(iterRequests){ if(iterRequests){
// process
FLOG(waitCounter.tic()); FLOG(waitCounter.tic());
MPI_Waitall( iterRequests, requests, status); FAssertLF(iterRequests <= 8);
FMpi::MpiAssert(MPI_Waitall( iterRequests, requests, status), __LINE__);
FLOG(waitCounter.tac()); FLOG(waitCounter.tac());
if(needToRecv){
// if(idProcess == 53){
// fprintf(stderr,"You found One : %d, nbofCells %d Intervals : %d{%lld,%lld} %d{%lld,%lld} Left : %d{%lld,%lld} %d{%lld,%lld} %lld\n",
// idProcess,numberOfCells,
// idxLevel, getWorkingInterval(idxLevel).leftIndex,
// getWorkingInterval(idxLevel).rightIndex,
// idxLevel+1, getWorkingInterval(idxLevel+1).leftIndex,
// getWorkingInterval(idxLevel+1).rightIndex,
// idxLevel, getWorkingInterval(idxLevel,idProcess-1).leftIndex,
// getWorkingInterval(idxLevel,idProcess-1).rightIndex,
// idxLevel+1,getWorkingInterval(idxLevel+1,idProcess-1).leftIndex,
// getWorkingInterval(idxLevel+1,idProcess-1).rightIndex,iterArray[0].getCurrentGlobalIndex());
// }
// Need to compute
FLOG(computationCounter.tic());
iterArray[firstCellWork].getCurrentCell()->deserializeDown(recvBuffer);
kernels[threadNumber]->L2L( iterArray[firstCellWork].getCurrentCell() , iterArray[firstCellWork].getCurrentChild(), idxLevel);
FLOG(computationCounter.tac());
}
} }
- }//End Of Single Section
+ if(hasToReceive){
- FLOG(prepareCounter.tac());
+ FAssertLF(iterRequests != 0);
iterArray[nbCellsToSkip-1].getCurrentCell()->deserializeDown(recvBuffer);
kernels[threadNumber]->L2L( iterArray[nbCellsToSkip-1].getCurrentCell() , iterArray[nbCellsToSkip-1].getCurrentChild(), idxLevel);
}
FLOG(prepareCounter.tac());
}
- FLOG(computationCounter.tic());
+ #pragma omp single nowait
- //#pragma omp parallel
+ {
FLOG(computationCounter.tic());
}
#pragma omp for nowait
- for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
+ for(int idxCell = nbCellsToSkip ; idxCell < totalNbCellsAtLevel ; ++idxCell){
myThreadkernels->L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
}
- }//End of parallel section
+ }
FLOG(computationCounter.tac());
sendBuffer.reset();
@@ -1852,21 +1094,6 @@ private:
FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ );
FLOG(gatherCounter.tac());
{//TODO : remove
//Print the globalReceiveMap for Process 0
// if(idProcess == 0)
// {
// printf("\n Proc 0 :: \n");
// for(int u = 0 ; u < nbProcess ; ++u){
// for(int v = 0 ; v < nbProcess ; ++v){
// printf("\t %d",globalReceiveMap[u*nbProcess+v]);
// }
// printf("\n");
// }
// }
}
//Prepare receive
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me.
@@ -2288,23 +1515,6 @@ private:
FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ );
FLOG(gatherCounter.tac());
{//TODO : remove
//Print the globalReceiveMap for Process 0
if(idProcess == 0)
{
printf("\n Proc 0 :: \n");
for(int u = 0 ; u < nbProcess ; ++u){
for(int v = 0 ; v < nbProcess ; ++v){
printf("\t %d",globalReceiveMap[u*nbProcess+v]);
}
printf("\n");
}
}
}
//Prepare receive
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has sth for me.
@@ -2391,14 +1601,7 @@ private:
for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
const int endAtThisShape = shapeLeaf[idxShape] + previous;
// //#pragma omp single
// {
// char file[5];
// sprintf(file,"re%d",idProcess);
// FILE * fd = fopen(file,"a+");
// fprintf(fd,"Proc %d \t, Color : %d, from %d to %d, total %d \n",idProcess,idxShape,previous,endAtThisShape,endAtThisShape-previous);
// fclose(fd);
// }
#pragma omp for schedule(auto)
for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; ++idxLeafs){
LeafData& currentIter = leafsDataArray[idxLeafs];
...
@@ -58,27 +58,27 @@ public:
////////////////////////////////////////////////////////
enum FMpiTag {
// FMpiTreeBuilder
- TagExchangeIndexs,
+ TagExchangeIndexs = 100,
- TagSplittedLeaf,
+ TagSplittedLeaf = 200,
- TagExchangeNbLeafs,
+ TagExchangeNbLeafs = 300,
- TagSandSettling,
+ TagSandSettling = 400,
// FQuickSort
- TagQuickSort,
+ TagQuickSort = 500,
// FMM
- TagFmmM2M,
+ TagFmmM2M = 1000,
- TagFmmL2L,
+ TagFmmL2L = 2000,
- TagFmmP2P,
+ TagFmmP2P = 3000,
// Bitonic,
- TagBitonicMin,
+ TagBitonicMin = 4000,
- TagBitonicMax,
+ TagBitonicMax = 5000,
- TagBitonicMinMess,
+ TagBitonicMinMess = 6000,
- TagBitonicMaxMess,
+ TagBitonicMaxMess = 7000,
// Last defined tag
- TagLast,
+ TagLast = 8000,
};
////////////////////////////////////////////////////////
...
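The spacing of the tag values ties in with the per-level tags now used by the communication calls in this commit (e.g. FMpi::TagFmmM2M + idxLevel in the M2M pass): with 1000 values between the FMM base tags, a base tag plus a level offset can no longer collide with the next pass's tags. A small sketch of that assumption, reusing the enum names from the diff and an assumed bound on the octree height:

#include <cassert>

// Sketch only: the two base values come from the enum above; the height bound is an assumption.
enum FMpiTag { TagFmmM2M = 1000, TagFmmL2L = 2000 };

// Pattern used by the MPI_Isend/MPI_Irecv calls in the diff: one distinct tag per level.
int tagForLevel(const int baseTag, const int idxLevel) {
    return baseTag + idxLevel;
}

int main() {
    const int assumedMaxOctreeHeight = 20; // far below the 1000-value spacing
    for (int idxLevel = 0; idxLevel < assumedMaxOctreeHeight; ++idxLevel) {
        assert(tagForLevel(TagFmmM2M, idxLevel) < TagFmmL2L); // M2M tags never reach the L2L range
    }
    return 0;
}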