solverstack / ScalFMM / Commits / 31427e57

Commit 31427e57 authored Oct 28, 2013 by PIACIBELLO Cyrille

    FFmmAlgorithmProc works again

parent 499701b4

Changes: 1

Src/Core/FFmmAlgorithmThreadProc.hpp
...

@@ -38,25 +38,25 @@
 #include "FCoreCommon.hpp"

 /**
  * @author Berenger Bramas (berenger.bramas@inria.fr)
  * @class FFmmAlgorithmThreadProc
  * @brief
  * Please read the license
  *
  * This class is a threaded FMM algorithm with MPI.
  * It just iterates on a tree and calls the kernels with the right arguments.
  * It uses the inspector-executor model:
  * it iterates on the tree and builds an array, then works in parallel on that array.
  *
  * Of course this class does not deallocate the pointers given as arguments.
  *
  * Threaded & based on the inspector-executor model
  * schedule(runtime) export OMP_NUM_THREADS=2
  * export OMPI_CXX=`which g++-4.4`
  * mpirun -np 2 valgrind --suppressions=/usr/share/openmpi/openmpi-valgrind.supp
  *   --tool=memcheck --leak-check=yes --show-reachable=yes --num-callers=20 --track-fds=yes
  *   ./Tests/testFmmAlgorithmProc ../Data/testLoaderSmall.fma.tmp
  */
 template <class OctreeClass, class CellClass, class ContainerClass, class KernelClass, class LeafClass>
 class FFmmAlgorithmThreadProc : protected FAssertable, public FAbstractAlgorithm {
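The header comment above describes the inspector-executor pattern: a sequential pass over the octree fills a flat array (iterArray), and the OpenMP threads then split that array between them. A minimal standalone sketch of the idea, assuming a hypothetical Leaf struct and collectLeaves helper in place of the real OctreeClass iterators and kernel calls:

    #include <omp.h>
    #include <cstdio>
    #include <vector>

    // Hypothetical stand-in for an octree leaf; the real code stores
    // OctreeClass::Iterator objects in iterArray instead.
    struct Leaf { int mortonIndex; };

    // Inspector: sequentially walk the tree (here just a vector) and
    // build a flat array of the items to work on.
    static std::vector<Leaf*> collectLeaves(std::vector<Leaf>& tree){
        std::vector<Leaf*> work;
        for(Leaf& leaf : tree) work.push_back(&leaf);
        return work;
    }

    int main(){
        std::vector<Leaf> tree(8);
        for(int i = 0 ; i < 8 ; ++i) tree[i].mortonIndex = i;

        std::vector<Leaf*> iterArray = collectLeaves(tree);   // inspector
        const int leafs = int(iterArray.size());

        // Executor: threads share the flat array, like the
        // "#pragma omp for nowait" P2M/M2M loops in the hunks below.
        #pragma omp parallel
        {
            #pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                std::printf("thread %d handles leaf %d\n",
                            omp_get_thread_num(), iterArray[idxLeafs]->mortonIndex);
            }
        }
        return 0;
    }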
...

@@ -67,7 +67,7 @@ class FFmmAlgorithmThreadProc : protected FAssertable, public FAbstractAlgorithm
     const FMpi::FComm& comm;                    //< MPI comm

-    typename OctreeClass::Iterator* iterArray;
+    typename OctreeClass::Iterator* iterArray;  //
     int numberOfLeafs;                          //< To store the size at the previous level

     const int MaxThreads;                       //< the max number of thread allowed by openmp
...

@@ -75,7 +75,7 @@ class FFmmAlgorithmThreadProc : protected FAssertable, public FAbstractAlgorithm
     const int nbProcess;        //< Number of process
     const int idProcess;        //< Id of current process

-    const int OctreeHeight;
+    const int OctreeHeight;     //< Height of the tree

    /** An interval is the morton index interval
     * that a proc use (it holds data in this interval)
...

@@ -96,7 +96,6 @@ class FFmmAlgorithmThreadProc : protected FAssertable, public FAbstractAlgorithm
 public:
    /** Get current proc interval at level */
    Interval& getWorkingInterval(int level){
        return getWorkingInterval(level, idProcess);
...

@@ -178,6 +177,7 @@ public:
        octreeIterator.gotoBottomLeft();
        octreeIterator.moveUp();

+       //Da fck is dat ?!
        MortonIndex currentLimit = intervals[idProcess - 1].max >> 3;

        for(int idxLevel = OctreeHeight - 2 ; idxLevel >= 1 ; --idxLevel){
...

@@ -234,10 +234,10 @@ private:
        } while(octreeIterator.moveRight());

        FLOG(FTic computationCounter);
        #pragma omp parallel
        {
            KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
            #pragma omp for nowait
            for(int idxLeafs = 0 ; idxLeafs < leafs ; ++idxLeafs){
                myThreadkernels->P2M( iterArray[idxLeafs].getCurrentCell() , iterArray[idxLeafs].getCurrentListSrc());
            }
...

@@ -373,10 +373,10 @@ private:
            // Compute
            const int endIndex = (hasToReceive ? numberOfCells - 1 : numberOfCells);
            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
                #pragma omp for nowait
                for(int idxCell = cellsToSend + 1 ; idxCell < endIndex ; ++idxCell){
                    myThreadkernels.M2M( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
...

@@ -646,12 +646,12 @@ private:
            octreeIterator = avoidGotoLeftIterator;

            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
                const CellClass* neighbors[343];

                #pragma omp for schedule(dynamic) nowait
                for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                    const int counter = tree->getInteractionNeighbors(neighbors, iterArray[idxCell].getCurrentGlobalCoordinate(), idxLevel);
                    if(counter) myThreadkernels->M2L( iterArray[idxCell].getCurrentCell() , neighbors, counter, idxLevel);
...

@@ -728,14 +728,14 @@ private:
            // Compute this cells
            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass* const myThreadkernels = kernels[omp_get_thread_num()];
                MortonIndex neighborsIndex[189];
                int neighborsPosition[189];
                const CellClass* neighbors[343];

                #pragma omp for schedule(dynamic) nowait
                for(int idxCell = 0 ; idxCell < numberOfCells ; ++idxCell){
                    // compute indexes
                    memset(neighbors, 0, 343 * sizeof(CellClass*));
...

@@ -885,10 +885,10 @@ private:
            FLOG(prepareCounter.tac());

            FLOG(computationCounter.tic());
            #pragma omp parallel
            {
                KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
                #pragma omp for nowait
                for(int idxCell = firstCellWork + 1 ; idxCell < numberOfCells ; ++idxCell){
                    myThreadkernels.L2L( iterArray[idxCell].getCurrentCell() , iterArray[idxCell].getCurrentChild(), idxLevel);
                }
...

@@ -962,6 +962,10 @@ private:
        FBufferReader** const recvBuffer = new FBufferReader*[nbProcess];
        memset(recvBuffer, 0, sizeof(FBufferReader*) * nbProcess);

+       /* This is a nbProcess x nbProcess matrix of integers;
+        * let U and V be ids of processes:
+        * globalReceiveMap[U*nbProcess + V] == size of information needed by V and owned by U
+        */
        int* const globalReceiveMap = new int[nbProcess * nbProcess];
        memset(globalReceiveMap, 0, sizeof(int) * nbProcess * nbProcess);
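The comment added above defines globalReceiveMap as a flat nbProcess x nbProcess matrix where entry U*nbProcess + V is the amount of data owned by process U that process V will need. A minimal hedged sketch of that convention, using dummy fill values and the same MPI_Allgather call that appears further down in this diff:

    #include <mpi.h>
    #include <cstdio>
    #include <vector>

    int main(int argc, char** argv){
        MPI_Init(&argc, &argv);
        int nbProcess = 0, idProcess = 0;
        MPI_Comm_size(MPI_COMM_WORLD, &nbProcess);
        MPI_Comm_rank(MPI_COMM_WORLD, &idProcess);

        // Each rank fills one row: partsToSend[V] = bytes this rank owns
        // that rank V will need (dummy values here, for illustration only).
        std::vector<int> partsToSend(nbProcess, 0);
        for(int v = 0 ; v < nbProcess ; ++v){
            if(v != idProcess) partsToSend[v] = 8 * (idProcess + 1);
        }

        // Gather every row so that each rank holds the full matrix:
        // globalReceiveMap[U*nbProcess + V] == bytes owned by U, needed by V.
        std::vector<int> globalReceiveMap(nbProcess * nbProcess, 0);
        MPI_Allgather(partsToSend.data(), nbProcess, MPI_INT,
                      globalReceiveMap.data(), nbProcess, MPI_INT, MPI_COMM_WORLD);

        // What this rank must receive from rank idxProc is the transposed
        // entry, as in the "Prepare receive" loop further down.
        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
            if(globalReceiveMap[idxProc * nbProcess + idProcess]){
                std::printf("rank %d expects %d bytes from rank %d\n", idProcess,
                            globalReceiveMap[idxProc * nbProcess + idProcess], idxProc);
            }
        }

        MPI_Finalize();
        return 0;
    }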
...

@@ -980,41 +984,45 @@ private:
            } while(octreeIterator.moveRight());
        }

-       // Box limite
+       // Number of cells max
        const int limite = 1 << (this->OctreeHeight - 1);
        // pointer to send
        FVector<typename OctreeClass::Iterator>* const toSend = new FVector<typename OctreeClass::Iterator>[nbProcess];
-       // index
+       // array that will be sent to the other processes so they can build the globalReceiveMap
        int partsToSend[nbProcess];
        memset(partsToSend, 0, sizeof(int) * nbProcess);
        // To know if a leaf has been already sent to a proc
        int alreadySent[nbProcess];
+       //Will store the indexes of the neighbors of the current cell
        MortonIndex indexesNeighbors[26];
+       //Obviously unused
        int uselessIndexArray[26];

        for(int idxLeaf = 0 ; idxLeaf < this->numberOfLeafs ; ++idxLeaf){
            memset(alreadySent, 0, sizeof(int) * nbProcess);
            bool needOther = false;

+           //Get the neighbors of the current cell in indexesNeighbors, and their number in neighCount
            const int neighCount = getNeighborsIndexes(iterArray[idxLeaf].getCurrentGlobalCoordinate(), limite, indexesNeighbors, uselessIndexArray);
+           //Loop over the neighbor leafs
            for(int idxNeigh = 0 ; idxNeigh < neighCount ; ++idxNeigh){
+               //Test if the leaf belongs to someone else (false if it's mine)
                if(indexesNeighbors[idxNeigh] < (intervals[idProcess].min) || (intervals[idProcess].max) < indexesNeighbors[idxNeigh]){
                    needOther = true;

-                   // find the proc that need this information
+                   // find the proc that will need the current leaf
                    int procToReceive = idProcess;
                    while(procToReceive != 0 && indexesNeighbors[idxNeigh] < intervals[procToReceive].min){
-                       --procToReceive;
+                       --procToReceive;  //scroll through the processes "before" the current process
                    }
                    while(procToReceive != nbProcess - 1 && (intervals[procToReceive].max) < indexesNeighbors[idxNeigh]){
-                       ++procToReceive;
+                       ++procToReceive;  //scroll through the processes "after" the current process
                    }
                    // Test : Not Already Sent && USELESS TEST ?
                    if(!alreadySent[procToReceive] && intervals[procToReceive].min <= indexesNeighbors[idxNeigh] && indexesNeighbors[idxNeigh] <= intervals[procToReceive].max){
                        alreadySent[procToReceive] = 1;
...

@@ -1025,25 +1033,43 @@ private:
                    }
                }

-               if(needOther){
+               if(needOther){ //means that something needs to be sent (or received)
                    leafsNeedOther.set(idxLeaf, true);
                    ++countNeedOther;
                }
            }

+           // No idea why it is mandatory here, could it be a few lines earlier?
            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                if(partsToSend[idxProc]){
                    partsToSend[idxProc] += int(sizeof(int));
                }
            }

+           //Share globalReceiveMap with all processes
            FLOG(gatherCounter.tic());
            FMpi::MpiAssert( MPI_Allgather(partsToSend, nbProcess, MPI_INT, globalReceiveMap, nbProcess, MPI_INT, comm.getComm()), __LINE__ );
            FLOG(gatherCounter.tac());

+           {//TODO : remove
+               //Print the globalReceiveMap for Process 0
+               if(idProcess == 0)
+               {
+                   printf("\n Proc 0 :: \n");
+                   for(int u = 0 ; u < nbProcess ; ++u){
+                       for(int v = 0 ; v < nbProcess ; ++v){
+                           printf("\t %d", globalReceiveMap[u*nbProcess + v]);
+                       }
+                       printf("\n");
+                   }
+               }
+           }

            // Prepare receive
            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
-               if(globalReceiveMap[idxProc * nbProcess + idProcess]){
+               if(globalReceiveMap[idxProc * nbProcess + idProcess]){ //if idxProc has something for me
+                   //allocate a buffer of the right size
                    recvBuffer[idxProc] = new FBufferReader(globalReceiveMap[idxProc * nbProcess + idProcess]);

                    FMpi::MpiAssert( MPI_Irecv(recvBuffer[idxProc]->data(), recvBuffer[idxProc]->getSize(), MPI_BYTE, idxProc, FMpi::TagFmmP2P, comm.getComm(), &requests[iterRequest++]) , __LINE__ );
...

@@ -1054,20 +1080,31 @@ private:
            // Prepare send
            for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
                if(toSend[idxProc].getSize() != 0){
-                   sendBuffer[idxProc] = new FBufferWriter(partsToSend[idxProc]);
+                   //sendBuffer[idxProc] = new FBufferWriter(partsToSend[idxProc]); //Could be read out of globalReceiveMap
+                   sendBuffer[idxProc] = new FBufferWriter(globalReceiveMap[idProcess*nbProcess + idxProc]);

+                   // << is equivalent of write().
                    (*sendBuffer[idxProc]) << toSend[idxProc].getSize();
+                   if(idProcess == 0)
+                   {
+                       printf("Proc 0 :: toSend[1].getSize()==%d \n", toSend[1].getSize());
+                   }
                    for(int idxLeaf = 0 ; idxLeaf < toSend[idxProc].getSize() ; ++idxLeaf){
                        (*sendBuffer[idxProc]) << toSend[idxProc][idxLeaf].getCurrentGlobalIndex();
                        toSend[idxProc][idxLeaf].getCurrentListSrc()->save(*sendBuffer[idxProc]);
                    }
+#ifdef FUSE_DEBUG
+                   // TODO clean test
+                   if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){
+                       printf("Error 1056 fmm algo proc\n");
+                   }
+#endif
+                   //TEST BERENGER
+                   //if(sendBuffer[idxProc]->getSize() != partsToSend[idxProc]){
+                   //  {
+                   //      MPI_Barrier(MPI_COMM_WORLD);
+                   //      printf("Proc %d :: \t sizeof SendBuffer %d \t sizeof partToSend %d \t diff %d \t sizeof RecvBuffer %d\n",
+                   //             idProcess, sendBuffer[idxProc]->getSize(), partsToSend[idxProc], sendBuffer[idxProc]->getSize() - partsToSend[idxProc],
+                   //             recvBuffer[idxProc]->getSize()+1032);
+                   //      MPI_Barrier(MPI_COMM_WORLD);
+                   //  }

                    FMpi::MpiAssert( MPI_Isend( sendBuffer[idxProc]->data(), sendBuffer[idxProc]->getSize() , MPI_BYTE , idxProc , FMpi::TagFmmP2P , comm.getComm(), &requests[iterRequest++]) , __LINE__ );
...

@@ -1147,7 +1184,7 @@ private:
        FLOG(FTic computationCounter);
        #pragma omp parallel
        {
            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
            // There is a maximum of 26 neighbors
...

@@ -1157,7 +1194,7 @@ private:
            for(int idxShape = 0 ; idxShape < SizeShape ; ++idxShape){
                const int endAtThisShape = shapeLeaf[idxShape] + previous;
                #pragma omp for
                for(int idxLeafs = previous ; idxLeafs < endAtThisShape ; ++idxLeafs){
                    LeafData& currentIter = leafsDataArray[idxLeafs];
                    myThreadkernels.L2P(currentIter.cell, currentIter.targets);
...

@@ -1175,7 +1212,7 @@ private:
        FTRACE( regionP2PTrace.end() );

        //////////////////////////////////////////////////////////
        // Wait send receive
        //////////////////////////////////////////////////////////

        FLOG(FTic computation2Counter);
...

@@ -1190,6 +1227,7 @@ private:
            // Wait data
            FLOG(waitCounter.tic());
            MPI_Waitsome(iterRequest, requests, &countMessages, indexMessage, status);
            FLOG(waitCounter.tac());
            complete += countMessages;
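The MPI_Waitsome call above drains the request array built from the MPI_Irecv/MPI_Isend calls posted during the P2P preparation. A minimal sketch of that non-blocking exchange pattern, reduced to one integer per peer; the buffer contents and the tag are placeholders, not ScalFMM's:

    #include <mpi.h>
    #include <vector>

    int main(int argc, char** argv){
        MPI_Init(&argc, &argv);
        int nbProcess = 0, idProcess = 0;
        MPI_Comm_size(MPI_COMM_WORLD, &nbProcess);
        MPI_Comm_rank(MPI_COMM_WORLD, &idProcess);

        std::vector<int> recvData(nbProcess, -1), sendData(nbProcess, idProcess);
        std::vector<MPI_Request> requests;

        // Post all receives, then all sends, as the algorithm does with
        // recvBuffer/sendBuffer.
        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
            if(idxProc == idProcess) continue;
            MPI_Request req;
            MPI_Irecv(&recvData[idxProc], 1, MPI_INT, idxProc, 0, MPI_COMM_WORLD, &req);
            requests.push_back(req);
        }
        for(int idxProc = 0 ; idxProc < nbProcess ; ++idxProc){
            if(idxProc == idProcess) continue;
            MPI_Request req;
            MPI_Isend(&sendData[idxProc], 1, MPI_INT, idxProc, 0, MPI_COMM_WORLD, &req);
            requests.push_back(req);
        }

        // Drain the requests with MPI_Waitsome, counting completions the
        // same way the "complete += countMessages" loop does above.
        int complete = 0;
        const int iterRequest = int(requests.size());
        std::vector<int> indexMessage(iterRequest > 0 ? iterRequest : 1);
        std::vector<MPI_Status> status(iterRequest > 0 ? iterRequest : 1);
        while(complete < iterRequest){
            int countMessages = 0;
            MPI_Waitsome(iterRequest, requests.data(), &countMessages,
                         indexMessage.data(), status.data());
            complete += countMessages;
        }

        MPI_Finalize();
        return 0;
    }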
...

@@ -1218,7 +1256,7 @@ private:
        FTRACE( FTrace::FRegion regionOtherTrace("Compute P2P Other", __FUNCTION__ , __FILE__ , __LINE__) );
        FLOG( computation2Counter.tic() );
        #pragma omp parallel
        {
            KernelClass& myThreadkernels = (*kernels[omp_get_thread_num()]);
            // There is a maximum of 26 neighbors
...

@@ -1229,7 +1267,7 @@ private:
            const int limite = 1 << (this->OctreeHeight - 1);
            const int nbLeafToProceed = leafsNeedOtherData.getSize();
            #pragma omp for
            for(int idxLeafs = 0 ; idxLeafs < nbLeafToProceed ; ++idxLeafs){
                LeafData currentIter = leafsNeedOtherData[idxLeafs];
...

@@ -1275,7 +1313,12 @@ private:
    }

+   /* @brief Compute the cells in the neighborhood of a given cell
+    * @param center cell whose neighbors we are looking for
+    * @param limite
+    * @param indexes target array to store the computed MortonIndexes
+    * @param indexInArray store
+    */
    int getNeighborsIndexes(const FTreeCoordinate& center, const int limite, MortonIndex indexes[26], int indexInArray[26]) const {
        int idxNeig = 0;
        // We test all cells around
...

@@ -1290,7 +1333,7 @@ private:
                    // if we are not on the current cell
                    if(idxX || idxY || idxZ){
                        const FTreeCoordinate other(center.getX() + idxX, center.getY() + idxY, center.getZ() + idxZ);
                        indexes[idxNeig] = other.getMortonIndex(this->OctreeHeight - 1);
                        indexInArray[idxNeig] = ((idxX + 1) * 3 + (idxY + 1)) * 3 + (idxZ + 1);
                        ++idxNeig;
...
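For reference, the position code computed in getNeighborsIndexes, ((idxX+1)*3 + (idxY+1))*3 + (idxZ+1), maps each offset (idxX, idxY, idxZ) in {-1, 0, 1}^3 to a value in 0..26, with 13 corresponding to the centre cell that the if(idxX || idxY || idxZ) test skips. A small standalone check of that encoding, independent of the ScalFMM classes:

    #include <cstdio>

    int main(){
        // Enumerate the 3x3x3 neighbourhood and print the position code used
        // by getNeighborsIndexes; 13 (the centre) is skipped by the real code.
        for(int idxX = -1 ; idxX <= 1 ; ++idxX){
            for(int idxY = -1 ; idxY <= 1 ; ++idxY){
                for(int idxZ = -1 ; idxZ <= 1 ; ++idxZ){
                    const int position = ((idxX + 1) * 3 + (idxY + 1)) * 3 + (idxZ + 1);
                    std::printf("(%+d,%+d,%+d) -> %2d%s\n", idxX, idxY, idxZ, position,
                                (idxX || idxY || idxZ) ? "" : "   (centre, skipped)");
                }
            }
        }
        return 0;
    }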