Commit 31427e57 authored by PIACIBELLO Cyrille

FFmmAlgorithmProc works again

parent 499701b4
......@@ -38,25 +38,25 @@
#include "FCoreCommon.hpp"
/**
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FFmmAlgorithmThreadProc
* @brief
* Please read the license
*
* This class is a threaded FMM algorithm with mpi.
* It just iterates on a tree and call the kernels with good arguments.
* It used the inspector-executor model :
* iterates on the tree and builds an array to work in parallel on this array
*
* Of course this class does not deallocate pointer given in arguements.
*
* Threaded & based on the inspector-executor model
* schedule(runtime) export OMP_NUM_THREADS=2
* export OMPI_CXX=`which g++-4.4`
* mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
* --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
* ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
*/
* @author Berenger Bramas (berenger.bramas@inria.fr)
* @class FFmmAlgorithmThreadProc
* @brief
* Please read the license
*
* This class is a threaded FMM algorithm with MPI.
* It simply iterates on a tree and calls the kernels with the right arguments.
* It uses the inspector-executor model:
* it iterates on the tree and builds an array, then works in parallel on this array.
*
* Of course this class does not deallocate the pointers given as arguments.
*
* Threaded & based on the inspector-executor model
* schedule(runtime) export OMP_NUM_THREADS=2
* export OMPI_CXX=`which g++-4.4`
* mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
* --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
* ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
*/
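// Illustrative sketch (not part of this commit) of the inspector-executor
// pattern mentioned above. Only the idea matches the class: the octree is
// traversed sequentially to fill iterArray, and that array is then processed
// in parallel with OpenMP. Names and details below are simplified.
//
//   // Inspector: record one iterator per leaf while walking the tree
//   int nbLeaves = 0;
//   typename OctreeClass::Iterator octreeIterator(tree);
//   octreeIterator.gotoBottomLeft();
//   do{
//       iterArray[nbLeaves++] = octreeIterator;
//   } while(octreeIterator.moveRight());
//
//   // Executor: process the recorded leaves in parallel
//   #pragma omp parallel
//   {
//       KernelClass*const myThreadkernels = kernels[omp_get_thread_num()];
//       #pragma omp for nowait
//       for(int idxLeaf = 0 ; idxLeaf < nbLeaves ; ++idxLeaf){
//           myThreadkernels->P2M( iterArray[idxLeaf].getCurrentCell(),
//                                 iterArray[idxLeaf].getCurrentListSrc());
//       }
//   }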
template<class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
class FFmmAlgorithmThreadProc : protected FAssertable , public FAbstractAlgorithm {
......@@ -67,7 +67,7 @@ class FFmmAlgorithmThreadProc : protected FAssertable , public FAbstractAlgorith
const FMpi::FComm& comm; //< MPI comm
typename OctreeClass::Iterator* iterArray;
typename OctreeClass::Iterator* iterArray; //< Array of octree iterators used to work on the tree in parallel
int numberOfLeafs; //< To store the number of leaves
const int MaxThreads; //< the max number of thread allowed by openmp
......@@ -75,7 +75,7 @@ class FFmmAlgorithmThreadProc : protected FAssertable , public FAbstractAlgorith
const int nbProcess; //< Number of process
const int idProcess; //< Id of current process
const int OctreeHeight;
const int OctreeHeight; //< Height of the tree
/** An interval is the morton index interval
* that a proc use (it holds data in this interval)
......@@ -96,7 +96,6 @@ class FFmmAlgorithmThreadProc : protected FAssertable , public FAbstractAlgorith
public:
/** Get current proc interval at level */
Interval& getWorkingInterval( int level){
return getWorkingInterval(level, idProcess);
......@@ -178,6 +177,7 @@ public:
octreeIterator.gotoBottomLeft();
octreeIterator.moveUp();
//TODO: clarify why this is needed -- shifts the previous process's max Morton index to its parent cell one level up
MortonIndex currentLimit = intervals[idProcess-1].max >> 3;
for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){
......@@ -234,10 +234,10 @@ private:
} while(octreeIterator.moveRight());
FLOG(FTic computationCounter);
#pragma omp parallel
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
#pragma omp for nowait
#pragma omp for nowait
for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
}
......@@ -373,10 +373,10 @@ private:
// Compute
const int endIndex = (hasToReceive?numberOfCells-1:numberOfCells);
FLOG(computationCounter.tic());
#pragma omp parallel
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for nowait
#pragma omp for nowait
for( int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
}
......@@ -646,12 +646,12 @@ private:
octreeIterator = avoidGotoLeftIterator;
FLOG(computationCounter.tic());
#pragma omp parallel
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
const CellClass* neighbors[343];
#pragma omp for schedule(dynamic) nowait
#pragma omp for schedule(dynamic) nowait
for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
const int counter = tree->getInteractionNeighbors(neighbors, iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel);
if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
......@@ -728,14 +728,14 @@ private:
// Compute this cells
FLOG(computationCounter.tic());
#pragma omp parallel
#pragma omp parallel
{
KernelClass * const myThreadkernels = kernels[omp_get_thread_num()];
MortonIndex neighborsIndex[189];
int neighborsPosition[189];
const CellClass* neighbors[343];
#pragma omp for schedule(dynamic) nowait
#pragma omp for schedule(dynamic) nowait
for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
// compute indexes
memset(neighbors, 0, 343 * sizeof(CellClass*));
......@@ -885,10 +885,10 @@ private:
FLOG(prepareCounter.tac());
FLOG(computationCounter.tic());
#pragma omp parallel
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
#pragma omp for nowait
#pragma omp for nowait
for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
}
......@@ -962,6 +962,10 @@ private:
FBufferReader**const recvBuffer = new FBufferReader*[nbProcess];
memset(recvBuffer, 0, sizeof(FBufferReader*) * nbProcess);
/* This is a nbProcess x nbProcess matrix of integers.
* Let U and V be process ids:
* globalReceiveMap[U*nbProcess + V] == size of the information needed by V and owned by U
*/
int*const globalReceiveMap = new int[nbProcess * nbProcess];
memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess);
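// Reading example (illustrative): the matrix is stored row-major, so with
// nbProcess == 3 row U occupies entries [U*3 .. U*3+2]. The amount of data
// that process U == 1 owns and that process V == 2 needs sits at
//     globalReceiveMap[1 * nbProcess + 2]
// Hence each process reads its own column (index idProcess) to know how much
// to receive from every other rank -- exactly what the "Prepare receive" loop
// below does with globalReceiveMap[idxProc * nbProcess + idProcess].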
......@@ -980,41 +984,45 @@ private:
} while(octreeIterator.moveRight());
}
// Box limite
// Maximum number of cells in one dimension at the leaf level
const int limite = 1 << (this->OctreeHeight - 1);
// leaves to send, one vector per process
FVector<typename OctreeClass::Iterator>*const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
// index
// array that will be sent to the other processes so they can build the globalReceiveMap
int partsToSend[nbProcess];
memset(partsToSend, 0, sizeof(int) * nbProcess);
// To know if a leaf has already been sent to a proc
int alreadySent[nbProcess];
//Will store the Morton indexes of the neighbors of the current cell
MortonIndex indexesNeighbors[26];
//Relative positions of the neighbors, not used here
int uselessIndexArray[26];
for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
memset(alreadySent, 0, sizeof(int) * nbProcess);
bool needOther = false;
const int neighCount = getNeighborsIndexes(iterArray[idxLeaf].getCurrentGlobalCoordinate(), limite, indexesNeighbors, uselessIndexArray);
//Get the neighbors of the current leaf in indexesNeighbors, and their number in neighCount
const int neighCount = getNeighborsIndexes(iterArray[idxLeaf].getCurrentGlobalCoordinate(), limite,
indexesNeighbors, uselessIndexArray);
//Loop over the neighboring leaves
for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
//Test whether the leaf belongs to another process (false if it is mine)
if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){
needOther = true;
// find the proc that need this information
// find the proc that will need the current leaf
int procToReceive = idProcess;
while( procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].min){
--procToReceive;
--procToReceive; // scan processes "before" the current one
}
while( procToReceive != nbProcess - 1 && (intervals[procToReceive].max) < indexesNeighbors[idxNeigh]){
++procToReceive;
++procToReceive; // scan processes "after" the current one
}
// Test: not already sent && interval check (possibly redundant?)
if( !alreadySent[procToReceive] && intervals[procToReceive].min <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].max){
alreadySent[procToReceive] = 1;
......@@ -1025,25 +1033,43 @@ private:
}
}
if(needOther){
if(needOther){ // means that something needs to be sent (or received)
leafsNeedOther.set(idxLeaf,true);
++countNeedOther;
}
}
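// Illustrative helper (not part of this commit) equivalent to the two while
// loops above: assuming, as the scan does, that the intervals are ordered by
// rank, the rank owning a Morton index is found by walking from the current rank.
//
//   int findOwner(const Interval intervals[], const int nbProcess,
//                 const int startProc, const MortonIndex neighIndex){
//       int proc = startProc;
//       while(proc != 0 && neighIndex < intervals[proc].min)             --proc;
//       while(proc != nbProcess - 1 && intervals[proc].max < neighIndex) ++proc;
//       return proc;
//   }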
// Unclear why this is mandatory here; could it be done a few lines earlier?
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
if(partsToSend[idxProc]){
partsToSend[idxProc] += int(sizeof(int));
}
}
//Share globalReceiveMap with all processes
FLOG(gatherCounter.tic());
FMpi::MpiAssert( MPI_Allgather( partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ );
FLOG(gatherCounter.tac());
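// Note on the Allgather above: each rank contributes its partsToSend[] array
// (nbProcess ints), and MPI_Allgather concatenates the contributions in rank
// order, so rank p's array becomes row p of globalReceiveMap. Thus
// globalReceiveMap[p * nbProcess + q] holds what p will send to q, and every
// rank can read its own column to post the matching receives.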
{ //TODO: remove
//Print the globalReceiveMap for process 0
if(idProcess == 0)
{
printf("\n Proc 0 :: \n");
for(int u = 0 ; u < nbProcess ; ++u){
for(int v = 0 ; v < nbProcess ; ++v){
printf("\t %d",globalReceiveMap[u*nbProcess+v]);
}
printf("\n");
}
}
}
// Prepare receive
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
if(globalReceiveMap[idxProc * nbProcess + idProcess]){
if(globalReceiveMap[idxProc * nbProcess + idProcess]){ // if idxProc has something for me
//allocate a buffer of the right size
recvBuffer[idxProc] = new FBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]);
FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getSize(), MPI_BYTE,
idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
......@@ -1054,20 +1080,31 @@ private:
// Prepare send
for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
if(toSend[idxProc].getSize() != 0){
sendBuffer[idxProc] = new FBufferWriter(partsToSend[idxProc]);
//sendBuffer[idxProc] = new FBufferWriter(partsToSend[idxProc]); // could also be read from globalReceiveMap
sendBuffer[idxProc] = new FBufferWriter(globalReceiveMap[idProcess*nbProcess+idxProc]);
// << is equivalent to write().
(*sendBuffer[idxProc]) << toSend[idxProc].getSize();
if(idProcess == 0)
{
printf("Proc 0 :: toSend[1].getSize()==%d\n",toSend[1].getSize());
}
for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
(*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
}
#ifdef FUSE_DEBUG
// TODO clean test
if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){
printf("Error 1056 fmm algo proc\n");
}
#endif
//TEST BERENGER
//if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){
// {
// MPI_Barrier(MPI_COMM_WORLD);
// printf("Proc %d :: \t sizeof SendBuffer %d \t sizeof partToSend%d \t diff %d \t sizeof RecvBuffer%d\n",
// idProcess,sendBuffer[idxProc]->getSize(),partsToSend[idxProc],sendBuffer[idxProc]->getSize() - partsToSend[idxProc],
// recvBuffer[idxProc]->getSize()+1032);
// MPI_Barrier(MPI_COMM_WORLD);
// }
FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize() , MPI_BYTE ,
idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
......@@ -1147,7 +1184,7 @@ private:
FLOG(FTic computationCounter);
#pragma omp parallel
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
// There is a maximum of 26 neighbors
......@@ -1157,7 +1194,7 @@ private:
for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
const int endAtThisShape = shapeLeaf[idxShape] + previous;
#pragma omp for
#pragma omp for
for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; ++idxLeafs){
LeafData& currentIter = leafsDataArray[idxLeafs];
myThreadkernels.L2P(currentIter.cell, currentIter.targets);
......@@ -1175,7 +1212,7 @@ private:
FTRACE( regionP2PTrace.end() );
//////////////////////////////////////////////////////////
// Wait send receive
// Wait for sends and receives
//////////////////////////////////////////////////////////
FLOG(FTic computation2Counter);
......@@ -1190,6 +1227,7 @@ private:
// Wait data
FLOG(waitCounter.tic());
MPI_Waitsome(iterRequest, requests, &countMessages, indexMessage, status);
FLOG(waitCounter.tac());
complete += countMessages;
......@@ -1218,7 +1256,7 @@ private:
FTRACE( FTrace::FRegion regionOtherTrace("Compute P2P Other", __FUNCTION__ , __FILE__ , __LINE__) );
FLOG( computation2Counter.tic() );
#pragma omp parallel
#pragma omp parallel
{
KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
// There is a maximum of 26 neighbors
......@@ -1229,7 +1267,7 @@ private:
const int limite = 1 << (this->OctreeHeight - 1);
const int nbLeafToProceed = leafsNeedOtherData.getSize();
#pragma omp for
#pragma omp for
for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
LeafData currentIter = leafsNeedOtherData[idxLeafs];
......@@ -1275,7 +1313,12 @@ private:
}
/* @brief Compute the Morton indexes of the cells in the neighborhood of a given cell
* @param center cell whose neighbors we are looking for
* @param limite maximum number of cells in one dimension at this level
* @param indexes target array in which the computed Morton indexes are stored
* @param indexInArray stores the relative position (0..26) of each neighbor
*/
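// Usage note (illustrative; the local names below are placeholders): the
// relative position stored in indexInArray is ((dx+1)*3 + (dy+1))*3 + (dz+1),
// so the offset (-1,-1,-1) maps to 0 and (+1,+1,+1) maps to 26; the center
// cell (position 13) is skipped, hence at most 26 entries. A typical call,
// as in the P2P preparation above:
//
//   MortonIndex neighborIndexes[26];
//   int neighborPositions[26];
//   const int counter = getNeighborsIndexes(iterArray[idxLeaf].getCurrentGlobalCoordinate(),
//                                           limite, neighborIndexes, neighborPositions);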
int getNeighborsIndexes(const FTreeCoordinate& center, const int limite, MortonIndex indexes[26], int indexInArray[26]) const{
int idxNeig = 0;
// We test all cells around
......@@ -1290,7 +1333,7 @@ private:
// if we are not on the current cell
if( idxX || idxY || idxZ ){
const FTreeCoordinate other(center.getX() + idxX,center.getY() + idxY,center.getZ() + idxZ);
const FTreeCoordinate other(center.getX() + idxX, center.getY() + idxY, center.getZ() + idxZ);
indexes[ idxNeig ] = other.getMortonIndex(this->OctreeHeight - 1);
indexInArray[ idxNeig ] = ((idxX+1)*3 + (idxY+1)) * 3 + (idxZ+1);
++idxNeig;
......